From 8a764a875fe3cf3a83296bacd00bfc41917e95e2 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 12 Feb 2015 20:04:39 +0100 Subject: [PATCH 001/148] x86/asm/decoder: Create artificial 3rd byte for 2-byte VEX Before this patch, users need to do this to fetch vex.vvvv: if (insn->vex_prefix.nbytes == 2) { vex_vvvv = ((insn->vex_prefix.bytes[1] >> 3) & 0xf) ^ 0xf; } if (insn->vex_prefix.nbytes == 3) { vex_vvvv = ((insn->vex_prefix.bytes[2] >> 3) & 0xf) ^ 0xf; } Make it so that insn->vex_prefix.bytes[2] always contains vex.wvvvvLpp bits. Signed-off-by: Denys Vlasenko Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Frank Ch. Eigler Cc: Jim Keniston Cc: Oleg Nesterov Cc: Srikar Dronamraju Link: http://lkml.kernel.org/r/1423767879-31691-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/lib/insn.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 1313ae6b478b..5ea00ddc1825 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -164,6 +164,12 @@ found: /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; } else { + /* + * For VEX2, fake VEX3-like byte#2. + * Makes it easier to decode vex.W, vex.vvvv, + * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0. + */ + insn->vex_prefix.bytes[2] = b2 & 0x7f; insn->vex_prefix.nbytes = 2; insn->next_byte += 2; } From cbb53b9623a70f012e1fdfb6fc0af6878df4762b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 12 Feb 2015 20:06:57 +0100 Subject: [PATCH 002/148] x86/asm/decoder: Explain CALLW discrepancy between Intel and AMD In 64-bit mode, AMD and Intel CPUs treat 0x66 prefix before branch insns differently. For near branches, it affects decode too since immediate offset's width is different. See these empirical tests: http://marc.info/?l=linux-kernel&m=139714939728946&w=2 Signed-off-by: Denys Vlasenko Cc: Masami Hiramatsu Cc: Oleg Nesterov Link: http://lkml.kernel.org/r/1423768017-31766-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/lib/x86-opcode-map.txt | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 1a2be7c6895d..816488c0b97e 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -273,6 +273,9 @@ dd: ESC de: ESC df: ESC # 0xe0 - 0xef +# Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix +# in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation +# to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD. e0: LOOPNE/LOOPNZ Jb (f64) e1: LOOPE/LOOPZ Jb (f64) e2: LOOP Jb (f64) @@ -281,6 +284,10 @@ e4: IN AL,Ib e5: IN eAX,Ib e6: OUT Ib,AL e7: OUT Ib,eAX +# With 0x66 prefix in 64-bit mode, for AMD CPUs immediate offset +# in "near" jumps and calls is 16-bit. For CALL, +# push of return address is 16-bit wide, RSP is decremented by 2 +# but is not truncated to 16 bits, unlike RIP. e8: CALL Jz (f64) e9: JMP-near Jz (f64) ea: JMP-far Ap (i64) @@ -456,6 +463,7 @@ AVXcode: 1 7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1) 7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3) # 0x0f 0x80-0x8f +# Note: "forced64" is Intel CPU behavior (see comment about CALL insn). 80: JO Jz (f64) 81: JNO Jz (f64) 82: JB/JC/JNAE Jz (f64) @@ -842,6 +850,7 @@ EndTable GrpTable: Grp5 0: INC Ev 1: DEC Ev +# Note: "forced64" is Intel CPU behavior (see comment about CALL insn). 
2: CALLN Ev (f64) 3: CALLF Ep 4: JMPN Ev (f64) From 91e5ed49fca09c2b83b262b9757d1376ee2b46c3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 27 Jan 2015 16:06:02 -0800 Subject: [PATCH 003/148] x86/asm/decoder: Fix and enforce max instruction size in the insn decoder x86 instructions cannot exceed 15 bytes, and the instruction decoder should enforce that. Prior to 6ba48ff46f76, the instruction length limit was implicitly set to 16, which was an approximation of 15, but there is currently no limit at all. Fix MAX_INSN_SIZE (it should be 15, not 16), and fix the decoder to reject instructions that exceed MAX_INSN_SIZE. Other than potentially confusing some of the decoder sanity checks, I'm not aware of any actual problems that omitting this check would cause, nor am I aware of any practical problems caused by the MAX_INSN_SIZE error. Signed-off-by: Andy Lutomirski Acked-by: Masami Hiramatsu Cc: Dave Hansen Fixes: 6ba48ff46f76 ("x86: Remove arbitrary instruction size limit ... Link: http://lkml.kernel.org/r/f8f0bc9b8c58cfd6830f7d88400bf1396cbdcd0f.1422403511.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/insn.h | 2 +- arch/x86/lib/insn.c | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 47f29b1d1846..e7814b74caf8 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -69,7 +69,7 @@ struct insn { const insn_byte_t *next_byte; }; -#define MAX_INSN_SIZE 16 +#define MAX_INSN_SIZE 15 #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 5ea00ddc1825..8f72b334aea0 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -52,6 +52,13 @@ */ void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) { + /* + * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid + * even if the input buffer is long enough to hold them. + */ + if (buf_len > MAX_INSN_SIZE) + buf_len = MAX_INSN_SIZE; + memset(insn, 0, sizeof(*insn)); insn->kaddr = kaddr; insn->end_kaddr = kaddr + buf_len; From 5b171e8218044d3c951d20a39512df861e349722 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Wed, 28 Jan 2015 00:16:28 +0600 Subject: [PATCH 004/148] x86/asm/boot: Fix path in comments Signed-off-by: Alexander Kuleshov Cc: Martin Mares Link: http://lkml.kernel.org/r/1422382588-10367-1-git-send-email-kuleshovmail@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index a468c0a65c42..6975f5dcc368 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -1,5 +1,5 @@ /* - * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit + * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit * * Copyright (C) 2000 Andrea Arcangeli SuSE * Copyright (C) 2000 Pavel Machek @@ -56,7 +56,7 @@ startup_64: * %rsi holds a physical pointer to real_mode_data. * * We come here either directly from a 64bit bootloader, or from - * arch/x86_64/boot/compressed/head.S. + * arch/x86/boot/compressed/head_64.S. * * We only come here initially at boot nothing else comes here. 
* From 0cdb81bef20b1d9e12111fa6cd81f748ebd87778 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 23 Jan 2015 08:35:13 +0000 Subject: [PATCH 005/148] x86-64: Also clear _PAGE_GLOBAL from __supported_pte_mask if !cpu_has_pge Not just setting it when the feature is available is for consistency, and may allow Xen to drop its custom clearing of the flag (unless it needs it cleared earlier than this code executes). Note that the change is benign to ix86, as the flag starts out clear there. Signed-off-by: Jan Beulich Cc: Andy Lutomirski Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/54C215D10200007800058912@mail.emea.novell.com Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 079c3b6a3ff1..090499a363cb 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -179,7 +179,8 @@ static void __init probe_page_size_mask(void) if (cpu_has_pge) { set_in_cr4(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; - } + } else + __supported_pte_mask &= ~_PAGE_GLOBAL; } #ifdef CONFIG_X86_32 From fb148d83ec6924b7766731e58739d7281b6fb8c7 Mon Sep 17 00:00:00 2001 From: Alexander Kuleshov Date: Thu, 19 Feb 2015 13:34:58 +0600 Subject: [PATCH 006/148] x86/asm/boot: Use already defined KEEP_SEGMENTS macro in head_{32,64}.S There is already defined macro KEEP_SEGMENTS in , let's use it instead of hardcoded constants. Signed-off-by: Alexander Kuleshov Cc: "H. Peter Anvin" Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1424331298-7456-1-git-send-email-kuleshovmail@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/head_32.S | 3 ++- arch/x86/boot/compressed/head_64.S | 3 ++- arch/x86/kernel/head_32.S | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 1d7fbbcc196d..8ef964ddc18e 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -29,6 +29,7 @@ #include #include #include +#include __HEAD ENTRY(startup_32) @@ -102,7 +103,7 @@ preferred_addr: * Test KEEP_SEGMENTS flag to see if the bootloader is asking * us to not reload segments */ - testb $(1<<6), BP_loadflags(%esi) + testb $KEEP_SEGMENTS, BP_loadflags(%esi) jnz 1f cli diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 6b1766c6c082..90c152135e92 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -31,6 +31,7 @@ #include #include #include +#include __HEAD .code32 @@ -46,7 +47,7 @@ ENTRY(startup_32) * Test KEEP_SEGMENTS flag to see if the bootloader is asking * us to not reload segments */ - testb $(1<<6), BP_loadflags(%esi) + testb $KEEP_SEGMENTS, BP_loadflags(%esi) jnz 1f cli diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index f36bd42d6f0c..d031bad9e07e 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -22,6 +22,7 @@ #include #include #include +#include /* Physical address */ #define pa(X) ((X) - __PAGE_OFFSET) @@ -90,7 +91,7 @@ ENTRY(startup_32) /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ - testb $(1<<6), BP_loadflags(%esi) + testb $KEEP_SEGMENTS, BP_loadflags(%esi) jnz 2f /* From 719d359dc7b6be3e43d6661f192ceb980b10ee26 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Thu, 19 Feb 2015 10:37:28 -0700 Subject: [PATCH 007/148] x86/asm: Add support for the pcommit instruction Add support 
for the new pcommit (persistent commit) instruction. This instruction was announced in the document "Intel Architecture Instruction Set Extensions Programming Reference" with reference number 319433-022: https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf

The pcommit instruction ensures that data that has been flushed from the processor's cache hierarchy with clwb, clflushopt or clflush is accepted to memory and is durable on the DIMM. The primary use case for this is persistent memory.

This function shows how to properly use clwb/clflushopt/clflush and pcommit with appropriate fencing:

void flush_and_commit_buffer(void *vaddr, unsigned int size)
{
	void *vend = vaddr + size - 1;

	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
		clwb(vaddr);

	/* Flush any possible final partial cacheline */
	clwb(vend);

	/*
	 * sfence to order clwb/clflushopt/clflush cache flushes
	 * mfence via mb() also works
	 */
	wmb();

	/* pcommit and the required sfence for ordering */
	pcommit_sfence();
}

After this function completes, the data pointed to by vaddr has been accepted to memory and will be durable if the vaddr points to persistent memory.

Pcommit must always be ordered by an mfence or sfence, so to help simplify things we include both the pcommit and the required sfence in the alternatives generated by pcommit_sfence(). The other option is to keep them separated, but on platforms that don't support pcommit this would then turn into:

void flush_and_commit_buffer(void *vaddr, unsigned int size)
{
	void *vend = vaddr + size - 1;

	for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size)
		clwb(vaddr);

	/* Flush any possible final partial cacheline */
	clwb(vend);

	/*
	 * sfence to order clwb/clflushopt/clflush cache flushes
	 * mfence via mb() also works
	 */
	wmb();

	nop(); /* from pcommit(), via alternatives */

	/*
	 * sfence to order pcommit
	 * mfence via mb() also works
	 */
	wmb();
}

This is still correct, but now you've got two fences separated by only a nop. With the commit and the fence together in pcommit_sfence() you avoid the final unneeded fence.

Signed-off-by: Ross Zwisler Acked-by: Borislav Petkov Acked-by: H.
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1424367448-24254-1-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/include/asm/special_insns.h | 10 ++++++++++ 2 files changed, 11 insertions(+) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 90a54851aedc..d6428ea5d316 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -231,6 +231,7 @@ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index e820c080a4e9..096250143832 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -4,6 +4,8 @@ #ifdef __KERNEL__ +#include + static inline void native_clts(void) { asm volatile("clts"); @@ -199,6 +201,14 @@ static inline void clflushopt(volatile void *__p) "+m" (*(volatile char __force *)__p)); } +static inline void pcommit_sfence(void) +{ + alternative(ASM_NOP7, + ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */ + "sfence", + X86_FEATURE_PCOMMIT); +} + #define nop() asm volatile ("nop") From 338ea55579d1dbd83753b10db74da747b2f1998f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 4 Jan 2015 11:49:56 +0100 Subject: [PATCH 008/148] x86/lib/copy_user_64.S: Remove FIX_ALIGNMENT define It is unconditionally enabled so remove it. No object file change. Signed-off-by: Borislav Petkov --- arch/x86/lib/copy_user_64.S | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index dee945d55594..1530ec2c1b12 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -8,9 +8,6 @@ #include #include - -#define FIX_ALIGNMENT 1 - #include #include #include @@ -45,7 +42,6 @@ .endm .macro ALIGN_DESTINATION -#ifdef FIX_ALIGNMENT /* check for bad alignment of destination */ movl %edi,%ecx andl $7,%ecx @@ -67,7 +63,6 @@ _ASM_EXTABLE(100b,103b) _ASM_EXTABLE(101b,103b) -#endif .endm /* Standard copy_to_user with segment limit checking */ From db477a3386dee183130916d6bbf21f5828b0b2e2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 30 Dec 2014 20:27:09 +0100 Subject: [PATCH 009/148] x86/alternatives: Cleanup DPRINTK macro Make it pass __func__ implicitly. Also, dump info about each replacing we're doing. Fixup comments and style while at it. Signed-off-by: Borislav Petkov --- arch/x86/kernel/alternative.c | 41 +++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 703130f469ec..1e86e85bcf58 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -52,10 +52,10 @@ static int __init setup_noreplace_paravirt(char *str) __setup("noreplace-paravirt", setup_noreplace_paravirt); #endif -#define DPRINTK(fmt, ...) \ -do { \ - if (debug_alternative) \ - printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +#define DPRINTK(fmt, args...) 
\ +do { \ + if (debug_alternative) \ + printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \ } while (0) /* @@ -243,12 +243,13 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void *text_poke_early(void *addr, const void *opcode, size_t len); -/* Replace instructions with better alternatives for this CPU type. - This runs before SMP is initialized to avoid SMP problems with - self modifying code. This implies that asymmetric systems where - APs have less capabilities than the boot processor are not handled. - Tough. Make sure you disable such features by hand. */ - +/* + * Replace instructions with better alternatives for this CPU type. This runs + * before SMP is initialized to avoid SMP problems with self modifying code. + * This implies that asymmetric systems where APs have less capabilities than + * the boot processor are not handled. Tough. Make sure you disable such + * features by hand. + */ void __init_or_module apply_alternatives(struct alt_instr *start, struct alt_instr *end) { @@ -256,10 +257,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start, u8 *instr, *replacement; u8 insnbuf[MAX_PATCH_LEN]; - DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); + DPRINTK("alt table %p -> %p", start, end); /* * The scan order should be from start to end. A later scanned - * alternative code can overwrite a previous scanned alternative code. + * alternative code can overwrite previously scanned alternative code. * Some kernel functions (e.g. memcpy, memset, etc) use this order to * patch code. * @@ -275,11 +276,19 @@ void __init_or_module apply_alternatives(struct alt_instr *start, if (!boot_cpu_has(a->cpuid)) continue; + DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d)", + a->cpuid >> 5, + a->cpuid & 0x1f, + instr, a->instrlen, + replacement, a->replacementlen); + memcpy(insnbuf, replacement, a->replacementlen); /* 0xe8 is a relative jump; fix the offset. */ - if (*insnbuf == 0xe8 && a->replacementlen == 5) - *(s32 *)(insnbuf + 1) += replacement - instr; + if (*insnbuf == 0xe8 && a->replacementlen == 5) { + *(s32 *)(insnbuf + 1) += replacement - instr; + DPRINTK("Fix CALL offset: 0x%x", *(s32 *)(insnbuf + 1)); + } add_nops(insnbuf + a->replacementlen, a->instrlen - a->replacementlen); @@ -371,8 +380,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, smp->locks_end = locks_end; smp->text = text; smp->text_end = text_end; - DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", - __func__, smp->locks, smp->locks_end, + DPRINTK("locks %p -> %p, text %p -> %p, name %s\n", + smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); list_add_tail(&smp->next, &smp_alt_modules); From 4332195c5615bf748624094ce4ff6797e475024d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 27 Dec 2014 10:41:52 +0100 Subject: [PATCH 010/148] x86/alternatives: Add instruction padding Up until now we have always paid attention to make sure the length of the new instruction replacing the old one is at least less or equal to the length of the old instruction. If the new instruction is longer, at the time it replaces the old instruction it will overwrite the beginning of the next instruction in the kernel image and cause your pants to catch fire. 
So instead of having to pay attention, teach the alternatives framework to pad shorter old instructions with NOPs at buildtime - but only in the case when len(old instruction(s)) < len(new instruction(s)) and add nothing in the >= case. (In that case we do add_nops() when patching). This way the alternatives user shouldn't have to care about instruction sizes and simply use the macros. Add asm ALTERNATIVE* flavor macros too, while at it. Also, we need to save the pad length in a separate struct alt_instr member for NOP optimization and the way to do that reliably is to carry the pad length instead of trying to detect whether we're looking at single-byte NOPs or at pathological instruction offsets like e9 90 90 90 90, for example, which is a valid instruction. Thanks to Michael Matz for the great help with toolchain questions. Signed-off-by: Borislav Petkov --- arch/x86/include/asm/alternative-asm.h | 43 ++++++++++++++++- arch/x86/include/asm/alternative.h | 65 ++++++++++++++++---------- arch/x86/include/asm/cpufeature.h | 22 ++++++--- arch/x86/include/asm/smap.h | 4 +- arch/x86/kernel/alternative.c | 6 +-- arch/x86/kernel/entry_32.S | 2 +- arch/x86/lib/clear_page_64.S | 4 +- arch/x86/lib/copy_page_64.S | 2 +- arch/x86/lib/copy_user_64.S | 4 +- arch/x86/lib/memcpy_64.S | 4 +- arch/x86/lib/memmove_64.S | 2 +- arch/x86/lib/memset_64.S | 4 +- 12 files changed, 114 insertions(+), 48 deletions(-) diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 372231c22a47..524bddce0b76 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -18,12 +18,53 @@ .endm #endif -.macro altinstruction_entry orig alt feature orig_len alt_len +.macro altinstruction_entry orig alt feature orig_len alt_len pad_len .long \orig - . .long \alt - . 
.word \feature .byte \orig_len .byte \alt_len + .byte \pad_len +.endm + +.macro ALTERNATIVE oldinstr, newinstr, feature +140: + \oldinstr +141: + .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr +144: + .popsection +.endm + +.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 +140: + \oldinstr +141: + .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 + .skip -(((145f-144f)-(144f-143f)-(141b-140b)) > 0) * ((145f-144f)-(144f-143f)-(141b-140b)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b + altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr1 +144: + \newinstr2 +145: + .popsection .endm #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 473bdbee378a..5aef6a97d80e 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -48,8 +48,9 @@ struct alt_instr { s32 repl_offset; /* offset to replacement instruction */ u16 cpuid; /* cpuid bit set for replacement */ u8 instrlen; /* length of original instruction */ - u8 replacementlen; /* length of new instruction, <= instrlen */ -}; + u8 replacementlen; /* length of new instruction */ + u8 padlen; /* length of build-time padding */ +} __packed; extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); @@ -76,50 +77,61 @@ static inline int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ -#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" +#define b_replacement(num) "664"#num +#define e_replacement(num) "665"#num -#define b_replacement(number) "663"#number -#define e_replacement(number) "664"#number +#define alt_end_marker "663" +#define alt_slen "662b-661b" +#define alt_pad_len alt_end_marker"b-662b" +#define alt_total_slen alt_end_marker"b-661b" +#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f" -#define alt_slen "662b-661b" -#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" +#define __OLDINSTR(oldinstr, num) \ + "661:\n\t" oldinstr "\n662:\n" \ + ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \ + "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n" -#define ALTINSTR_ENTRY(feature, number) \ +#define OLDINSTR(oldinstr, num) \ + __OLDINSTR(oldinstr, num) \ + alt_end_marker ":\n" + +/* + * Pad the second replacement alternative with additional NOPs if it is + * additionally longer than the first replacement alternative. 
+ */ +#define OLDINSTR_2(oldinstr, num1, num2) \ + __OLDINSTR(oldinstr, num1) \ + ".skip -(((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)) > 0) * " \ + "((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)),0x90\n" \ + alt_end_marker ":\n" + +#define ALTINSTR_ENTRY(feature, num) \ " .long 661b - .\n" /* label */ \ - " .long " b_replacement(number)"f - .\n" /* new instruction */ \ + " .long " b_replacement(num)"f - .\n" /* new instruction */ \ " .word " __stringify(feature) "\n" /* feature bit */ \ - " .byte " alt_slen "\n" /* source len */ \ - " .byte " alt_rlen(number) "\n" /* replacement len */ + " .byte " alt_total_slen "\n" /* source len */ \ + " .byte " alt_rlen(num) "\n" /* replacement len */ \ + " .byte " alt_pad_len "\n" /* pad len */ -#define DISCARD_ENTRY(number) /* rlen <= slen */ \ - " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" - -#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \ - b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" +#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \ + b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t" /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, newinstr, feature) \ - OLDINSTR(oldinstr) \ + OLDINSTR(oldinstr, 1) \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(feature, 1) \ ".popsection\n" \ - ".pushsection .discard,\"aw\",@progbits\n" \ - DISCARD_ENTRY(1) \ - ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ ".popsection" #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ - OLDINSTR(oldinstr) \ + OLDINSTR_2(oldinstr, 1, 2) \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(feature1, 1) \ ALTINSTR_ENTRY(feature2, 2) \ ".popsection\n" \ - ".pushsection .discard,\"aw\",@progbits\n" \ - DISCARD_ENTRY(1) \ - DISCARD_ENTRY(2) \ - ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ @@ -146,6 +158,9 @@ static inline int alternatives_text_reserved(void *start, void *end) #define alternative(oldinstr, newinstr, feature) \ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") +#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \ + asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory") + /* * Alternative inline assembly with input. 
* diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 90a54851aedc..9b1df43dadbb 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -418,6 +418,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P0\n" /* 1: do replace */ " .byte 2b - 1b\n" /* source len */ " .byte 0\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" /* skipping size check since replacement size = 0 */ : : "i" (X86_FEATURE_ALWAYS) : : t_warn); @@ -432,6 +433,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P0\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 0\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" /* skipping size check since replacement size = 0 */ : : "i" (bit) : : t_no); @@ -457,6 +459,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P1\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 4f - 3f\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ @@ -491,23 +494,28 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) */ asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n" "2:\n" + ".skip -(((5f-4f) - (2b-1b)) > 0) * " + "((5f-4f) - (2b-1b)),0x90\n" + "3:\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" /* src offset */ - " .long 3f - .\n" /* repl offset */ + " .long 4f - .\n" /* repl offset */ " .word %P1\n" /* always replace */ - " .byte 2b - 1b\n" /* src len */ - " .byte 4f - 3f\n" /* repl len */ + " .byte 3b - 1b\n" /* src len */ + " .byte 5f - 4f\n" /* repl len */ + " .byte 3b - 2b\n" /* pad len */ ".previous\n" ".section .altinstr_replacement,\"ax\"\n" - "3: .byte 0xe9\n .long %l[t_no] - 2b\n" - "4:\n" + "4: .byte 0xe9\n .long %l[t_no] - 2b\n" + "5:\n" ".previous\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" /* src offset */ " .long 0\n" /* no replacement */ " .word %P0\n" /* feature bit */ - " .byte 2b - 1b\n" /* src len */ + " .byte 3b - 1b\n" /* src len */ " .byte 0\n" /* repl len */ + " .byte 0\n" /* pad len */ ".previous\n" : : "i" (bit), "i" (X86_FEATURE_ALWAYS) : : t_dynamic, t_no); @@ -527,6 +535,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) " .word %P2\n" /* always replace */ " .byte 2b - 1b\n" /* source len */ " .byte 4f - 3f\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ @@ -541,6 +550,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) " .word %P1\n" /* feature bit */ " .byte 4b - 3b\n" /* src len */ " .byte 6f - 5f\n" /* repl len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 8d3120f4e270..c56cb4f37be9 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -33,7 +33,7 @@ 662: __ASM_CLAC ; \ .popsection ; \ .pushsection .altinstructions, "a" ; \ - altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ + altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3, 0 ; \ .popsection #define ASM_STAC \ @@ -42,7 +42,7 @@ 662: __ASM_STAC ; \ .popsection ; \ .pushsection .altinstructions, "a" ; \ - altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ + altinstruction_entry 661b, 662b, 
X86_FEATURE_SMAP, 3, 3, 0 ; \ .popsection #else /* CONFIG_X86_SMAP */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1e86e85bcf58..c99b0f13a90e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -270,7 +270,6 @@ void __init_or_module apply_alternatives(struct alt_instr *start, for (a = start; a < end; a++) { instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; - BUG_ON(a->replacementlen > a->instrlen); BUG_ON(a->instrlen > sizeof(insnbuf)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); if (!boot_cpu_has(a->cpuid)) @@ -290,8 +289,9 @@ void __init_or_module apply_alternatives(struct alt_instr *start, DPRINTK("Fix CALL offset: 0x%x", *(s32 *)(insnbuf + 1)); } - add_nops(insnbuf + a->replacementlen, - a->instrlen - a->replacementlen); + if (a->instrlen > a->replacementlen) + add_nops(insnbuf + a->replacementlen, + a->instrlen - a->replacementlen); text_poke_early(instr, insnbuf, a->instrlen); } diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 000d4199b03e..d23c9a1d40e1 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -819,7 +819,7 @@ ENTRY(simd_coprocessor_error) 661: pushl_cfi $do_general_protection 662: .section .altinstructions,"a" - altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f + altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f, 0 .previous .section .altinstr_replacement,"ax" 663: pushl $do_simd_coprocessor_error diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index f2145cfa12a6..38e57faefd71 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -67,7 +67,7 @@ ENDPROC(clear_page) .previous .section .altinstructions,"a" altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\ - .Lclear_page_end-clear_page, 2b-1b + .Lclear_page_end-clear_page, 2b-1b, 0 altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \ - .Lclear_page_end-clear_page,3b-2b + .Lclear_page_end-clear_page,3b-2b, 0 .previous diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 176cca67212b..f1ffdbb07755 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -106,5 +106,5 @@ ENDPROC(copy_page) .previous .section .altinstructions,"a" altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \ - .Lcopy_page_end-copy_page, 2b-1b + .Lcopy_page_end-copy_page, 2b-1b, 0 .previous diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 1530ec2c1b12..a9aedd6aa7f7 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -36,8 +36,8 @@ .previous .section .altinstructions,"a" - altinstruction_entry 0b,2b,\feature1,5,5 - altinstruction_entry 0b,3b,\feature2,5,5 + altinstruction_entry 0b,2b,\feature1,5,5,0 + altinstruction_entry 0b,3b,\feature2,5,5,0 .previous .endm diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 89b53c9968e7..bbfdacc01760 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -202,7 +202,7 @@ ENDPROC(__memcpy) */ .section .altinstructions, "a" altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ - .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c + .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c,0 altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ - .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e + .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e,0 .previous diff --git a/arch/x86/lib/memmove_64.S 
b/arch/x86/lib/memmove_64.S index 9c4b530575da..bbfa6b269ece 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -221,7 +221,7 @@ ENTRY(__memmove) altinstruction_entry .Lmemmove_begin_forward, \ .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \ .Lmemmove_end_forward-.Lmemmove_begin_forward, \ - .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs + .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs,0 .previous ENDPROC(__memmove) ENDPROC(memmove) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 6f44935c6a60..f6153c1cdddc 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -150,7 +150,7 @@ ENDPROC(__memset) */ .section .altinstructions,"a" altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ - .Lfinal-__memset,.Lmemset_e-.Lmemset_c + .Lfinal-__memset,.Lmemset_e-.Lmemset_c,0 altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ - .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e + .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e,0 .previous From 48c7a2509f9e237d8465399d9cdfe487d3212a23 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 5 Jan 2015 13:48:41 +0100 Subject: [PATCH 011/148] x86/alternatives: Make JMPs more robust Up until now we had to pay attention to relative JMPs in alternatives about how their relative offset gets computed so that the jump target is still correct. Or, as it is the case for near CALLs (opcode e8), we still have to go and readjust the offset at patching time. What is more, the static_cpu_has_safe() facility had to forcefully generate 5-byte JMPs since we couldn't rely on the compiler to generate properly sized ones so we had to force the longest ones. Worse than that, sometimes it would generate a replacement JMP which is longer than the original one, thus overwriting the beginning of the next instruction at patching time. So, in order to alleviate all that and make using JMPs more straight-forward we go and pad the original instruction in an alternative block with NOPs at build time, should the replacement(s) be longer. This way, alternatives users shouldn't pay special attention so that original and replacement instruction sizes are fine but the assembler would simply add padding where needed and not do anything otherwise. As a second aspect, we go and recompute JMPs at patching time so that we can try to make 5-byte JMPs into two-byte ones if possible. If not, we still have to recompute the offsets as the replacement JMP gets put far away in the .altinstr_replacement section leading to a wrong offset if copied verbatim. 
For example, on a locally generated kernel image old insn VA: 0xffffffff810014bd, CPU feat: X86_FEATURE_ALWAYS, size: 2 __switch_to: ffffffff810014bd: eb 21 jmp ffffffff810014e0 repl insn: size: 5 ffffffff81d0b23c: e9 b1 62 2f ff jmpq ffffffff810014f2 gets corrected to a 2-byte JMP: apply_alternatives: feat: 3*32+21, old: (ffffffff810014bd, len: 2), repl: (ffffffff81d0b23c, len: 5) alt_insn: e9 b1 62 2f ff recompute_jumps: next_rip: ffffffff81d0b241, tgt_rip: ffffffff810014f2, new_displ: 0x00000033, ret len: 2 converted to: eb 33 90 90 90 and a 5-byte JMP: old insn VA: 0xffffffff81001516, CPU feat: X86_FEATURE_ALWAYS, size: 2 __switch_to: ffffffff81001516: eb 30 jmp ffffffff81001548 repl insn: size: 5 ffffffff81d0b241: e9 10 63 2f ff jmpq ffffffff81001556 gets shortened into a two-byte one: apply_alternatives: feat: 3*32+21, old: (ffffffff81001516, len: 2), repl: (ffffffff81d0b241, len: 5) alt_insn: e9 10 63 2f ff recompute_jumps: next_rip: ffffffff81d0b246, tgt_rip: ffffffff81001556, new_displ: 0x0000003e, ret len: 2 converted to: eb 3e 90 90 90 ... and so on. This leads to a net win of around 40ish replacements * 3 bytes savings =~ 120 bytes of I$ on an AMD guest which means some savings of precious instruction cache bandwidth. The padding to the shorter 2-byte JMPs are single-byte NOPs which on smart microarchitectures means discarding NOPs at decode time and thus freeing up execution bandwidth. Signed-off-by: Borislav Petkov --- arch/x86/include/asm/cpufeature.h | 10 +-- arch/x86/kernel/alternative.c | 103 ++++++++++++++++++++++++++++-- arch/x86/lib/copy_user_64.S | 11 ++-- 3 files changed, 105 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 9b1df43dadbb..e0571a24d582 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -486,13 +486,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) static __always_inline __pure bool _static_cpu_has_safe(u16 bit) { #ifdef CC_HAVE_ASM_GOTO -/* - * We need to spell the jumps to the compiler because, depending on the offset, - * the replacement jump can be bigger than the original jump, and this we cannot - * have. Thus, we force the jump to the widest, 4-byte, signed relative - * offset even though the last would often fit in less bytes. - */ - asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n" + asm_volatile_goto("1: jmp %l[t_dynamic]\n" "2:\n" ".skip -(((5f-4f) - (2b-1b)) > 0) * " "((5f-4f) - (2b-1b)),0x90\n" @@ -506,7 +500,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) " .byte 3b - 2b\n" /* pad len */ ".previous\n" ".section .altinstr_replacement,\"ax\"\n" - "4: .byte 0xe9\n .long %l[t_no] - 2b\n" + "4: jmp %l[t_no]\n" "5:\n" ".previous\n" ".section .altinstructions,\"a\"\n" diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c99b0f13a90e..715af37bf008 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -58,6 +58,21 @@ do { \ printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \ } while (0) +#define DUMP_BYTES(buf, len, fmt, args...) \ +do { \ + if (unlikely(debug_alternative)) { \ + int j; \ + \ + if (!(len)) \ + break; \ + \ + printk(KERN_DEBUG fmt, ##args); \ + for (j = 0; j < (len) - 1; j++) \ + printk(KERN_CONT "%02hhx ", buf[j]); \ + printk(KERN_CONT "%02hhx\n", buf[j]); \ + } \ +} while (0) + /* * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes * that correspond to that nop. 
Getting from one nop to the next, we @@ -243,6 +258,71 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void *text_poke_early(void *addr, const void *opcode, size_t len); +/* + * Are we looking at a near JMP with a 1 or 4-byte displacement. + */ +static inline bool is_jmp(const u8 opcode) +{ + return opcode == 0xeb || opcode == 0xe9; +} + +static void __init_or_module +recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) +{ + u8 *next_rip, *tgt_rip; + s32 n_dspl, o_dspl; + int repl_len; + + if (a->replacementlen != 5) + return; + + o_dspl = *(s32 *)(insnbuf + 1); + + /* next_rip of the replacement JMP */ + next_rip = repl_insn + a->replacementlen; + /* target rip of the replacement JMP */ + tgt_rip = next_rip + o_dspl; + n_dspl = tgt_rip - orig_insn; + + DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl); + + if (tgt_rip - orig_insn >= 0) { + if (n_dspl - 2 <= 127) + goto two_byte_jmp; + else + goto five_byte_jmp; + /* negative offset */ + } else { + if (((n_dspl - 2) & 0xff) == (n_dspl - 2)) + goto two_byte_jmp; + else + goto five_byte_jmp; + } + +two_byte_jmp: + n_dspl -= 2; + + insnbuf[0] = 0xeb; + insnbuf[1] = (s8)n_dspl; + add_nops(insnbuf + 2, 3); + + repl_len = 2; + goto done; + +five_byte_jmp: + n_dspl -= 5; + + insnbuf[0] = 0xe9; + *(s32 *)&insnbuf[1] = n_dspl; + + repl_len = 5; + +done: + + DPRINTK("final displ: 0x%08x, JMP 0x%lx", + n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); +} + /* * Replace instructions with better alternatives for this CPU type. This runs * before SMP is initialized to avoid SMP problems with self modifying code. @@ -268,6 +348,8 @@ void __init_or_module apply_alternatives(struct alt_instr *start, * order. */ for (a = start; a < end; a++) { + int insnbuf_sz = 0; + instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; BUG_ON(a->instrlen > sizeof(insnbuf)); @@ -281,24 +363,35 @@ void __init_or_module apply_alternatives(struct alt_instr *start, instr, a->instrlen, replacement, a->replacementlen); + DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr); + DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement); + memcpy(insnbuf, replacement, a->replacementlen); + insnbuf_sz = a->replacementlen; /* 0xe8 is a relative jump; fix the offset. 
*/ if (*insnbuf == 0xe8 && a->replacementlen == 5) { *(s32 *)(insnbuf + 1) += replacement - instr; - DPRINTK("Fix CALL offset: 0x%x", *(s32 *)(insnbuf + 1)); + DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", + *(s32 *)(insnbuf + 1), + (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5); } - if (a->instrlen > a->replacementlen) + if (a->replacementlen && is_jmp(replacement[0])) + recompute_jump(a, instr, replacement, insnbuf); + + if (a->instrlen > a->replacementlen) { add_nops(insnbuf + a->replacementlen, a->instrlen - a->replacementlen); + insnbuf_sz += a->instrlen - a->replacementlen; + } + DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr); - text_poke_early(instr, insnbuf, a->instrlen); + text_poke_early(instr, insnbuf, insnbuf_sz); } } #ifdef CONFIG_SMP - static void alternatives_smp_lock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) { @@ -449,7 +542,7 @@ int alternatives_text_reserved(void *start, void *end) return 0; } -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_PARAVIRT void __init_or_module apply_paravirt(struct paravirt_patch_site *start, diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index a9aedd6aa7f7..dad718ce805c 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -25,14 +25,13 @@ */ .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2 0: - .byte 0xe9 /* 32bit jump */ - .long \orig-1f /* by default jump to orig */ + jmp \orig 1: .section .altinstr_replacement,"ax" -2: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt1-1b /* offset */ /* or alternatively to alt1 */ -3: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt2-1b /* offset */ /* or alternatively to alt2 */ +2: + jmp \alt1 +3: + jmp \alt2 .previous .section .altinstructions,"a" From 4fd4b6e5537cec5b56db0b22546dd439ebb26830 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 10 Jan 2015 20:34:07 +0100 Subject: [PATCH 012/148] x86/alternatives: Use optimized NOPs for padding Alternatives allow now for an empty old instruction. In this case we go and pad the space with NOPs at assembly time. However, there are the optimal, longer NOPs which should be used. Do that at patching time by adding alt_instr.padlen-sized NOPs at the old instruction address. Cc: Andy Lutomirski Signed-off-by: Borislav Petkov --- arch/x86/kernel/alternative.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 715af37bf008..af397cc98d05 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -323,6 +323,14 @@ done: n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); } +static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) +{ + add_nops(instr + (a->instrlen - a->padlen), a->padlen); + + DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", + instr, a->instrlen - a->padlen, a->padlen); +} + /* * Replace instructions with better alternatives for this CPU type. This runs * before SMP is initialized to avoid SMP problems with self modifying code. 
@@ -354,8 +362,12 @@ void __init_or_module apply_alternatives(struct alt_instr *start, replacement = (u8 *)&a->repl_offset + a->repl_offset; BUG_ON(a->instrlen > sizeof(insnbuf)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); - if (!boot_cpu_has(a->cpuid)) + if (!boot_cpu_has(a->cpuid)) { + if (a->padlen > 1) + optimize_nops(a, instr); + continue; + } DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d)", a->cpuid >> 5, From 090a3f615524c3f75d09fdb37f15ea1868d79f7e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 12 Jan 2015 18:19:40 +0100 Subject: [PATCH 013/148] x86/lib/copy_page_64.S: Use generic ALTERNATIVE macro ... instead of the semi-version with the spelled out sections. What is more, make the REP_GOOD version be the default copy_page() version as the majority of the relevant x86 CPUs do set X86_FEATURE_REP_GOOD. Thus, copy_page gets compiled to: ffffffff8130af80 : ffffffff8130af80: e9 0b 00 00 00 jmpq ffffffff8130af90 ffffffff8130af85: b9 00 02 00 00 mov $0x200,%ecx ffffffff8130af8a: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi) ffffffff8130af8d: c3 retq ffffffff8130af8e: 66 90 xchg %ax,%ax ffffffff8130af90 : ... and after the alternatives have run, the JMP to the old, unrolled version gets NOPed out: ffffffff8130af80 : ffffffff8130af80: 66 66 90 xchg %ax,%ax ffffffff8130af83: 66 90 xchg %ax,%ax ffffffff8130af85: b9 00 02 00 00 mov $0x200,%ecx ffffffff8130af8a: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi) ffffffff8130af8d: c3 retq On modern uarches, those NOPs are cheaper than the unconditional JMP previously. Signed-off-by: Borislav Petkov --- arch/x86/lib/copy_page_64.S | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index f1ffdbb07755..8239dbcbf984 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -2,23 +2,26 @@ #include #include +#include #include +/* + * Some CPUs run faster using the string copy instructions (sane microcode). + * It is also a lot simpler. Use this when possible. But, don't use streaming + * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the + * prefetch distance based on SMP/UP. + */ ALIGN -copy_page_rep: +ENTRY(copy_page) CFI_STARTPROC + ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD movl $4096/8, %ecx rep movsq ret CFI_ENDPROC -ENDPROC(copy_page_rep) +ENDPROC(copy_page) -/* - * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD. - * Could vary the prefetch distance based on SMP/UP. -*/ - -ENTRY(copy_page) +ENTRY(copy_page_regs) CFI_STARTPROC subq $2*8, %rsp CFI_ADJUST_CFA_OFFSET 2*8 @@ -90,21 +93,5 @@ ENTRY(copy_page) addq $2*8, %rsp CFI_ADJUST_CFA_OFFSET -2*8 ret -.Lcopy_page_end: CFI_ENDPROC -ENDPROC(copy_page) - - /* Some CPUs run faster using the string copy instructions. - It is also a lot simpler. Use this when possible */ - -#include - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp */ - .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions,"a" - altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \ - .Lcopy_page_end-copy_page, 2b-1b, 0 - .previous +ENDPROC(copy_page_regs) From de2ff888843a22918ef15306922083739d23dad3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 13 Jan 2015 01:38:17 +0100 Subject: [PATCH 014/148] x86/lib/copy_user_64.S: Convert to ALTERNATIVE_2 Use the asm macro and drop the locally grown version. 
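The feature ordering that the old ALTERNATIVE_JUMP comment spelled out is preserved: entries placed later in .altinstructions are applied later and therefore win when both features are present. As a minimal C-level sketch of the resulting dispatch (illustrative only; the selection happens once, in apply_alternatives() at boot, not on every call, and the parameter names here are made up):

	if (boot_cpu_has(X86_FEATURE_ERMS))
		ret = copy_user_enhanced_fast_string(to, from, len);
	else if (boot_cpu_has(X86_FEATURE_REP_GOOD))
		ret = copy_user_generic_string(to, from, len);
	else
		ret = copy_user_generic_unrolled(to, from, len);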
Signed-off-by: Borislav Petkov --- arch/x86/lib/copy_user_64.S | 40 ++++++++++--------------------------- 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index dad718ce805c..fa997dfaef24 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -16,30 +16,6 @@ #include #include -/* - * By placing feature2 after feature1 in altinstructions section, we logically - * implement: - * If CPU has feature2, jmp to alt2 is used - * else if CPU has feature1, jmp to alt1 is used - * else jmp to orig is used. - */ - .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2 -0: - jmp \orig -1: - .section .altinstr_replacement,"ax" -2: - jmp \alt1 -3: - jmp \alt2 - .previous - - .section .altinstructions,"a" - altinstruction_entry 0b,2b,\feature1,5,5,0 - altinstruction_entry 0b,3b,\feature2,5,5,0 - .previous - .endm - .macro ALIGN_DESTINATION /* check for bad alignment of destination */ movl %edi,%ecx @@ -73,9 +49,11 @@ ENTRY(_copy_to_user) jc bad_to_user cmpq TI_addr_limit(%rax),%rcx ja bad_to_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ - copy_user_generic_unrolled,copy_user_generic_string, \ - copy_user_enhanced_fast_string + ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ + "jmp copy_user_generic_string", \ + X86_FEATURE_REP_GOOD, \ + "jmp copy_user_enhanced_fast_string", \ + X86_FEATURE_ERMS CFI_ENDPROC ENDPROC(_copy_to_user) @@ -88,9 +66,11 @@ ENTRY(_copy_from_user) jc bad_from_user cmpq TI_addr_limit(%rax),%rcx ja bad_from_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ - copy_user_generic_unrolled,copy_user_generic_string, \ - copy_user_enhanced_fast_string + ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ + "jmp copy_user_generic_string", \ + X86_FEATURE_REP_GOOD, \ + "jmp copy_user_enhanced_fast_string", \ + X86_FEATURE_ERMS CFI_ENDPROC ENDPROC(_copy_from_user) From 669f8a900198599d3c2e2e463bafe12d30d96507 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 15 Jan 2015 09:17:12 +0100 Subject: [PATCH 015/148] x86/smap: Use ALTERNATIVE macro ... and drop unfolded version. No need for ASM_NOP3 anymore either as the alternatives do the proper padding at build time and insert proper NOPs at boot time. There should be no apparent operational change from this patch. 
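To make the build-time padding concrete, a rough walk-through (the 3-byte length of __ASM_CLAC/__ASM_STAC is taken from the "3, 3" arguments of the removed altinstruction_entry lines): with an empty old instruction, the .skip expression from the ALTERNATIVE asm macro,

	.skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90

emits max(0, replacement length - old length) bytes of 0x90, since GAS relational operators evaluate to -1 when true. Here that is 3 - 0 = 3 single-byte NOPs at the patch site. At boot, apply_alternatives() either copies in the real CLAC/STAC when X86_FEATURE_SMAP is set or, because padlen is greater than 1, lets optimize_nops() rewrite the padding into the optimal NOP form for the CPU.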
Signed-off-by: Borislav Petkov --- arch/x86/include/asm/smap.h | 28 ++++++++-------------------- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index c56cb4f37be9..ba665ebd17bb 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -27,23 +27,11 @@ #ifdef CONFIG_X86_SMAP -#define ASM_CLAC \ - 661: ASM_NOP3 ; \ - .pushsection .altinstr_replacement, "ax" ; \ - 662: __ASM_CLAC ; \ - .popsection ; \ - .pushsection .altinstructions, "a" ; \ - altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3, 0 ; \ - .popsection +#define ASM_CLAC \ + ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP -#define ASM_STAC \ - 661: ASM_NOP3 ; \ - .pushsection .altinstr_replacement, "ax" ; \ - 662: __ASM_STAC ; \ - .popsection ; \ - .pushsection .altinstructions, "a" ; \ - altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3, 0 ; \ - .popsection +#define ASM_STAC \ + ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP #else /* CONFIG_X86_SMAP */ @@ -61,20 +49,20 @@ static __always_inline void clac(void) { /* Note: a barrier is implicit in alternative() */ - alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP); + alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP); } static __always_inline void stac(void) { /* Note: a barrier is implicit in alternative() */ - alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP); + alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP); } /* These macros can be used in asm() statements */ #define ASM_CLAC \ - ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP) #define ASM_STAC \ - ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP) #else /* CONFIG_X86_SMAP */ From 8e65f6e03a90927b8de16c15da976baa6c3fff69 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 18 Jan 2015 12:35:55 +0100 Subject: [PATCH 016/148] x86/entry_32: Convert X86_INVD_BUG to ALTERNATIVE macro Booting a 486 kernel on an AMD guest with this patch applied, says: apply_alternatives: feat: 0*32+25, old: (c160a475, len: 5), repl: (c19557d4, len: 5) c160a475: alt_insn: 68 10 35 00 c1 c19557d4: rpl_insn: 68 80 39 00 c1 which is: old insn VA: 0xc160a475, CPU feat: X86_FEATURE_XMM, size: 5 simd_coprocessor_error: c160a475: 68 10 35 00 c1 push $0xc1003510 repl insn: 0xc19557d4, size: 5 c160a475: 68 80 39 00 c1 push $0xc1003980 Signed-off-by: Borislav Petkov --- arch/x86/kernel/entry_32.S | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index d23c9a1d40e1..b910577d1ff9 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -816,15 +816,9 @@ ENTRY(simd_coprocessor_error) pushl_cfi $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ -661: pushl_cfi $do_general_protection -662: -.section .altinstructions,"a" - altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f, 0 -.previous -.section .altinstr_replacement,"ax" -663: pushl $do_simd_coprocessor_error -664: -.previous + ALTERNATIVE "pushl_cfi $do_general_protection", \ + "pushl $do_simd_coprocessor_error", \ + X86_FEATURE_XMM #else pushl_cfi $do_simd_coprocessor_error #endif From 6620ef28c812b4782b10adb676580c2847d2bec8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 18 Jan 2015 12:57:41 +0100 
Subject: [PATCH 017/148] x86/lib/clear_page_64.S: Convert to ALTERNATIVE_2 macro Move clear_page() up so that we can get 2-byte forward JMPs when patching: apply_alternatives: feat: 3*32+16, old: (ffffffff8130adb0, len: 5), repl: (ffffffff81d0b859, len: 5) ffffffff8130adb0: alt_insn: 90 90 90 90 90 recompute_jump: new_displ: 0x0000003e ffffffff81d0b859: rpl_insn: eb 3e 66 66 90 even though the compiler generated 5-byte JMPs which we padded with 5 NOPs. Also, make the REP_GOOD version be the default as the majority of machines set REP_GOOD. This way we get to save ourselves the JMP: old insn VA: 0xffffffff813038b0, CPU feat: X86_FEATURE_REP_GOOD, size: 5, padlen: 0 clear_page: ffffffff813038b0 : ffffffff813038b0: e9 0b 00 00 00 jmpq ffffffff813038c0 repl insn: 0xffffffff81cf0e92, size: 0 old insn VA: 0xffffffff813038b0, CPU feat: X86_FEATURE_ERMS, size: 5, padlen: 0 clear_page: ffffffff813038b0 : ffffffff813038b0: e9 0b 00 00 00 jmpq ffffffff813038c0 repl insn: 0xffffffff81cf0e92, size: 5 ffffffff81cf0e92: e9 69 2a 61 ff jmpq ffffffff81303900 ffffffff813038b0 : ffffffff813038b0: e9 69 2a 61 ff jmpq ffffffff8091631e Signed-off-by: Borislav Petkov --- arch/x86/lib/clear_page_64.S | 66 +++++++++++++++--------------------- 1 file changed, 27 insertions(+), 39 deletions(-) diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index 38e57faefd71..e67e579c93bd 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,31 +1,35 @@ #include #include +#include #include /* - * Zero a page. - * rdi page - */ -ENTRY(clear_page_c) + * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is + * recommended to use this when possible and we do use them by default. + * If enhanced REP MOVSB/STOSB is not available, try to use fast string. + * Otherwise, use original. + */ + +/* + * Zero a page. + * %rdi - page + */ +ENTRY(clear_page) CFI_STARTPROC + + ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \ + "jmp clear_page_c_e", X86_FEATURE_ERMS + movl $4096/8,%ecx xorl %eax,%eax rep stosq ret CFI_ENDPROC -ENDPROC(clear_page_c) +ENDPROC(clear_page) -ENTRY(clear_page_c_e) +ENTRY(clear_page_orig) CFI_STARTPROC - movl $4096,%ecx - xorl %eax,%eax - rep stosb - ret - CFI_ENDPROC -ENDPROC(clear_page_c_e) -ENTRY(clear_page) - CFI_STARTPROC xorl %eax,%eax movl $4096/64,%ecx .p2align 4 @@ -45,29 +49,13 @@ ENTRY(clear_page) nop ret CFI_ENDPROC -.Lclear_page_end: -ENDPROC(clear_page) +ENDPROC(clear_page_orig) - /* - * Some CPUs support enhanced REP MOVSB/STOSB instructions. - * It is recommended to use this when possible. - * If enhanced REP MOVSB/STOSB is not available, try to use fast string. - * Otherwise, use original function. - * - */ - -#include - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp */ - .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ -2: .byte 0xeb /* jmp */ - .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */ -3: - .previous - .section .altinstructions,"a" - altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\ - .Lclear_page_end-clear_page, 2b-1b, 0 - altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \ - .Lclear_page_end-clear_page,3b-2b, 0 - .previous +ENTRY(clear_page_c_e) + CFI_STARTPROC + movl $4096,%ecx + xorl %eax,%eax + rep stosb + ret + CFI_ENDPROC +ENDPROC(clear_page_c_e) From c70e1b475f37f07ab7181ad28458666d59aae634 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 18 Jan 2015 15:19:55 +0100 Subject: [PATCH 018/148] x86/asm: Use alternative_2() in rdtsc_barrier() ... 
now that we have it. Acked-by: Andy Lutomirski Cc: Richard Weinberger Signed-off-by: Borislav Petkov --- arch/x86/include/asm/barrier.h | 6 ++---- arch/x86/um/asm/barrier.h | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 2ab1eb33106e..959e45b81fe2 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -95,13 +95,11 @@ do { \ * Stop RDTSC speculation. This is needed when you need to use RDTSC * (or get_cycles or vread that possibly accesses the TSC) in a defined * code region. - * - * (Could use an alternative three way for this if there was one.) */ static __always_inline void rdtsc_barrier(void) { - alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); - alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); + alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, + "lfence", X86_FEATURE_LFENCE_RDTSC); } #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 2d7d9a1f5b53..8ffd2146fa6a 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -64,8 +64,8 @@ */ static inline void rdtsc_barrier(void) { - alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); - alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); + alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, + "lfence", X86_FEATURE_LFENCE_RDTSC); } #endif From a930dc4543a2b213deb9fde12682716edff8a4a6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 18 Jan 2015 17:48:18 +0100 Subject: [PATCH 019/148] x86/asm: Cleanup prefetch primitives This is based on a patch originally by hpa. With the current improvements to the alternatives, we can simply use %P1 as a mem8 operand constraint and rely on the toolchain to generate the proper instruction sizes. For example, on 32-bit, where we use an empty old instruction we get: apply_alternatives: feat: 6*32+8, old: (c104648b, len: 4), repl: (c195566c, len: 4) c104648b: alt_insn: 90 90 90 90 c195566c: rpl_insn: 0f 0d 4b 5c ... apply_alternatives: feat: 6*32+8, old: (c18e09b4, len: 3), repl: (c1955948, len: 3) c18e09b4: alt_insn: 90 90 90 c1955948: rpl_insn: 0f 0d 08 ... apply_alternatives: feat: 6*32+8, old: (c1190cf9, len: 7), repl: (c1955a79, len: 7) c1190cf9: alt_insn: 90 90 90 90 90 90 90 c1955a79: rpl_insn: 0f 0d 0d a0 d4 85 c1 all with the proper padding done depending on the size of the replacement instruction the compiler generates. Signed-off-by: Borislav Petkov Cc: H. 
Peter Anvin --- arch/x86/include/asm/apic.h | 2 +- arch/x86/include/asm/processor.h | 16 +++++++--------- arch/x86/kernel/cpu/amd.c | 5 +++++ 3 files changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index efc3b22d896e..8118e94d50ab 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v) { volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); - alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, + alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP, ASM_OUTPUT2("=r" (v), "=m" (*addr)), ASM_OUTPUT2("0" (v), "m" (*addr))); } diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ec1c93588cef..7be2c9a6caba 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -761,10 +761,10 @@ extern char ignore_fpu_irq; #define ARCH_HAS_SPINLOCK_PREFETCH #ifdef CONFIG_X86_32 -# define BASE_PREFETCH ASM_NOP4 +# define BASE_PREFETCH "" # define ARCH_HAS_PREFETCH #else -# define BASE_PREFETCH "prefetcht0 (%1)" +# define BASE_PREFETCH "prefetcht0 %P1" #endif /* @@ -775,10 +775,9 @@ extern char ignore_fpu_irq; */ static inline void prefetch(const void *x) { - alternative_input(BASE_PREFETCH, - "prefetchnta (%1)", + alternative_input(BASE_PREFETCH, "prefetchnta %P1", X86_FEATURE_XMM, - "r" (x)); + "m" (*(const char *)x)); } /* @@ -788,10 +787,9 @@ static inline void prefetch(const void *x) */ static inline void prefetchw(const void *x) { - alternative_input(BASE_PREFETCH, - "prefetchw (%1)", - X86_FEATURE_3DNOW, - "r" (x)); + alternative_input(BASE_PREFETCH, "prefetchw %P1", + X86_FEATURE_3DNOWPREFETCH, + "m" (*(const char *)x)); } static inline void spin_lock_prefetch(const void *x) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a220239cea65..dd9e50500297 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -711,6 +711,11 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); + + /* 3DNow or LM implies PREFETCHW */ + if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) + if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) + set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); } #ifdef CONFIG_X86_32 From 84d95ad4cb9015ea953bf14cea05ba371d4d42bb Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 4 Feb 2015 08:57:00 +0100 Subject: [PATCH 020/148] x86/lib/memset_64.S: Convert to ALTERNATIVE_2 macro Make alternatives replace single JMPs instead of whole memset functions, thus decreasing the amount of instructions copied during patching time at boot. While at it, make it use the REP_GOOD version by default which means alternatives NOP out the JMP to the other versions, as REP_GOOD is set by default on the majority of relevant x86 processors. Signed-off-by: Borislav Petkov --- arch/x86/lib/memset_64.S | 61 ++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 37 deletions(-) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index f6153c1cdddc..93118fb23976 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -5,19 +5,30 @@ #include #include +.weak memset + /* * ISO C memset - set a memory block to a byte value. This function uses fast * string to get better performance than the original function. The code is * simpler and shorter than the orignal function as well. 
- * + * * rdi destination - * rsi value (char) - * rdx count (bytes) - * + * rsi value (char) + * rdx count (bytes) + * * rax original destination - */ - .section .altinstr_replacement, "ax", @progbits -.Lmemset_c: + */ +ENTRY(memset) +ENTRY(__memset) + /* + * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended + * to use it when possible. If not available, use fast string instructions. + * + * Otherwise, use original memset function. + */ + ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \ + "jmp memset_erms", X86_FEATURE_ERMS + movq %rdi,%r9 movq %rdx,%rcx andl $7,%edx @@ -31,8 +42,8 @@ rep stosb movq %r9,%rax ret -.Lmemset_e: - .previous +ENDPROC(memset) +ENDPROC(__memset) /* * ISO C memset - set a memory block to a byte value. This function uses @@ -45,21 +56,16 @@ * * rax original destination */ - .section .altinstr_replacement, "ax", @progbits -.Lmemset_c_e: +ENTRY(memset_erms) movq %rdi,%r9 movb %sil,%al movq %rdx,%rcx rep stosb movq %r9,%rax ret -.Lmemset_e_e: - .previous +ENDPROC(memset_erms) -.weak memset - -ENTRY(memset) -ENTRY(__memset) +ENTRY(memset_orig) CFI_STARTPROC movq %rdi,%r10 @@ -134,23 +140,4 @@ ENTRY(__memset) jmp .Lafter_bad_alignment .Lfinal: CFI_ENDPROC -ENDPROC(memset) -ENDPROC(__memset) - - /* Some CPUs support enhanced REP MOVSB/STOSB feature. - * It is recommended to use this when possible. - * - * If enhanced REP MOVSB/STOSB feature is not available, use fast string - * instructions. - * - * Otherwise, use original memset function. - * - * In .altinstructions section, ERMS feature is placed after REG_GOOD - * feature to implement the right patch order. - */ - .section .altinstructions,"a" - altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ - .Lfinal-__memset,.Lmemset_e-.Lmemset_c,0 - altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ - .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e,0 - .previous +ENDPROC(memset_orig) From a77600cd03a44e54df99ea742e02f4899b82c8e1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 4 Feb 2015 09:11:17 +0100 Subject: [PATCH 021/148] x86/lib/memmove_64.S: Convert memmove() to ALTERNATIVE macro Make it execute the ERMS version if support is present and we're in the forward memmove() part and remove the unfolded alternatives section definition. Signed-off-by: Borislav Petkov --- arch/x86/lib/memmove_64.S | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index bbfa6b269ece..0f8a0d0331b9 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -5,7 +5,6 @@ * This assembly file is re-written from memmove_64.c file. * - Copyright 2011 Fenghua Yu */ -#define _STRING_C #include #include #include @@ -44,6 +43,8 @@ ENTRY(__memmove) jg 2f .Lmemmove_begin_forward: + ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS + /* * movsq instruction have many startup latency * so we handle small size by general register. @@ -207,21 +208,5 @@ ENTRY(__memmove) 13: retq CFI_ENDPROC - - .section .altinstr_replacement,"ax" -.Lmemmove_begin_forward_efs: - /* Forward moving data. 
*/ - movq %rdx, %rcx - rep movsb - retq -.Lmemmove_end_forward_efs: - .previous - - .section .altinstructions,"a" - altinstruction_entry .Lmemmove_begin_forward, \ - .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \ - .Lmemmove_end_forward-.Lmemmove_begin_forward, \ - .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs,0 - .previous ENDPROC(__memmove) ENDPROC(memmove) From e0bc8d179e39a31bb3a56974374e55374fbf29be Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 4 Feb 2015 15:36:49 +0100 Subject: [PATCH 022/148] x86/lib/memcpy_64.S: Convert memcpy to ALTERNATIVE_2 macro Make REP_GOOD variant the default after alternatives have run. Signed-off-by: Borislav Petkov --- arch/x86/lib/memcpy_64.S | 68 +++++++++++++--------------------------- 1 file changed, 21 insertions(+), 47 deletions(-) diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index bbfdacc01760..b046664f5a1c 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -1,11 +1,19 @@ /* Copyright 2002 Andi Kleen */ #include - #include #include #include +/* + * We build a jump to memcpy_orig by default which gets NOPped out on + * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which + * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs + * to a jmp to memcpy_erms which does the REP; MOVSB mem copy. + */ + +.weak memcpy + /* * memcpy - Copy a memory block. * @@ -17,15 +25,11 @@ * Output: * rax original destination */ +ENTRY(__memcpy) +ENTRY(memcpy) + ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ + "jmp memcpy_erms", X86_FEATURE_ERMS -/* - * memcpy_c() - fast string ops (REP MOVSQ) based variant. - * - * This gets patched over the unrolled variant (below) via the - * alternative instructions framework: - */ - .section .altinstr_replacement, "ax", @progbits -.Lmemcpy_c: movq %rdi, %rax movq %rdx, %rcx shrq $3, %rcx @@ -34,29 +38,21 @@ movl %edx, %ecx rep movsb ret -.Lmemcpy_e: - .previous +ENDPROC(memcpy) +ENDPROC(__memcpy) /* - * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than - * memcpy_c. Use memcpy_c_e when possible. - * - * This gets patched over the unrolled variant (below) via the - * alternative instructions framework: + * memcpy_erms() - enhanced fast string memcpy. This is faster and + * simpler than memcpy. Use memcpy_erms when possible. */ - .section .altinstr_replacement, "ax", @progbits -.Lmemcpy_c_e: +ENTRY(memcpy_erms) movq %rdi, %rax movq %rdx, %rcx rep movsb ret -.Lmemcpy_e_e: - .previous +ENDPROC(memcpy_erms) -.weak memcpy - -ENTRY(__memcpy) -ENTRY(memcpy) +ENTRY(memcpy_orig) CFI_STARTPROC movq %rdi, %rax @@ -183,26 +179,4 @@ ENTRY(memcpy) .Lend: retq CFI_ENDPROC -ENDPROC(memcpy) -ENDPROC(__memcpy) - - /* - * Some CPUs are adding enhanced REP MOVSB/STOSB feature - * If the feature is supported, memcpy_c_e() is the first choice. - * If enhanced rep movsb copy is not available, use fast string copy - * memcpy_c() when possible. This is faster and code is simpler than - * original memcpy(). - * Otherwise, original memcpy() is used. - * In .altinstructions section, ERMS feature is placed after REG_GOOD - * feature to implement the right patch order. - * - * Replace only beginning, memcpy is used to apply alternatives, - * so it is silly to overwrite itself with nops - reboot is the - * only outcome... 
- */ - .section .altinstructions, "a" - altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ - .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c,0 - altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ - .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e,0 - .previous +ENDPROC(memcpy_orig) From 08571f1ae327bfb631cb7171bde5ea605df626c6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 24 Feb 2015 16:01:38 -0800 Subject: [PATCH 023/148] x86/ptrace: Remove checks for TIF_IA32 when changing CS and SS The ability for modified CS and/or SS to be useful has nothing to do with TIF_IA32. Similarly, if there's an exploit involving changing CS or SS, it's exploitable with or without a TIF_IA32 check. So just delete the check. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Link: http://lkml.kernel.org/r/71c7ab36456855d11ae07edd4945a7dfe80f9915.1424822291.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e510618b2e91..1e125817cf9f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -364,18 +364,12 @@ static int set_segment_reg(struct task_struct *task, case offsetof(struct user_regs_struct,cs): if (unlikely(value == 0)) return -EIO; -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) - task_pt_regs(task)->cs = value; -#endif + task_pt_regs(task)->cs = value; break; case offsetof(struct user_regs_struct,ss): if (unlikely(value == 0)) return -EIO; -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) - task_pt_regs(task)->ss = value; -#endif + task_pt_regs(task)->ss = value; break; } From 72c6fb4f74b6b3797f5b1abd6944d7a1d2adbf04 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 24 Feb 2015 16:01:39 -0800 Subject: [PATCH 024/148] x86/ia32-compat: Fix CLONE_SETTLS bitness of copy_thread() CLONE_SETTLS is expected to write a TLS entry in the GDT for 32-bit callers and to set FSBASE for 64-bit callers. The correct check is is_ia32_task(), which returns true in the context of a 32-bit syscall. TIF_IA32 is set if the task itself has a 32-bit personality, which is not the same thing. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Link: http://lkml.kernel.org/r/45e2d0d695393d76406a0c7225b82c76223e0cc5.1424822291.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 5a2c02913af3..936d43461dca 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -207,7 +207,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, */ if (clone_flags & CLONE_SETTLS) { #ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) + if (is_ia32_task()) err = do_set_thread_area(p, -1, (struct user_desc __user *)childregs->si, 0); else From b4d8327024637cb2a1f7910dcb5d0ad7a096f473 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Thu, 26 Feb 2015 13:49:39 +0800 Subject: [PATCH 025/148] x86/traps: Enable DEBUG_STACK after cpu_init() for TRAP_DB/BP Before this patch early_trap_init() installs DEBUG_STACK for X86_TRAP_BP and X86_TRAP_DB. However, DEBUG_STACK doesn't work correctly until cpu_init() <-- trap_init(). 
This patch passes 0 to set_intr_gate_ist() and set_system_intr_gate_ist() instead of DEBUG_STACK to let it use same stack as kernel, and installs DEBUG_STACK for them in trap_init(). As core runs at ring 0 between early_trap_init() and trap_init(), there is no chance to get a bad stack before trap_init(). As NMI is also enabled in trap_init(), we don't need to care about is_debug_stack() and related things used in arch/x86/kernel/nmi.c. Signed-off-by: Wang Nan Reviewed-by: Masami Hiramatsu Acked-by: Steven Rostedt Cc: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/1424929779-13174-1-git-send-email-wangnan0@huawei.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9d2073e2ecc9..42819886be0c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -925,9 +925,17 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) /* Set of traps needed for early debugging. */ void __init early_trap_init(void) { - set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); + /* + * Don't set ist to DEBUG_STACK as it doesn't work until TSS is + * ready in cpu_init() <-- trap_init(). Before trap_init(), CPU + * runs at ring 0 so it is impossible to hit an invalid stack. + * Using the original stack works well enough at this early + * stage. DEBUG_STACK will be equipped after cpu_init() in + * trap_init(). + */ + set_intr_gate_ist(X86_TRAP_DB, &debug, 0); /* int3 can be called from all */ - set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); + set_system_intr_gate_ist(X86_TRAP_BP, &int3, 0); #ifdef CONFIG_X86_32 set_intr_gate(X86_TRAP_PF, page_fault); #endif @@ -1005,6 +1013,15 @@ void __init trap_init(void) */ cpu_init(); + /* + * X86_TRAP_DB and X86_TRAP_BP have been set + * in early_trap_init(). However, DEBUG_STACK works only after + * cpu_init() loads TSS. See comments in early_trap_init(). + */ + set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); + /* int3 can be called from all */ + set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); + x86_init.irqs.trap_init(); #ifdef CONFIG_X86_64 From 0cf55934ecace74bb7d26c0e9679fb41675a8903 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 26 Feb 2015 18:38:16 +0100 Subject: [PATCH 026/148] perf/bench: Fix mem* routines usage after alternatives change Adjust perf bench to the new changes in the alternatives code for memcpy/memset. 
Reviewed-by: Hitoshi Mitake Signed-off-by: Borislav Petkov --- tools/perf/bench/mem-memcpy-x86-64-asm-def.h | 6 +++--- tools/perf/bench/mem-memcpy-x86-64-asm.S | 2 -- tools/perf/bench/mem-memset-x86-64-asm-def.h | 6 +++--- tools/perf/bench/mem-memset-x86-64-asm.S | 2 -- tools/perf/util/include/asm/alternative-asm.h | 1 + 5 files changed, 7 insertions(+), 10 deletions(-) diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h index d66ab799b35f..8c0c1a2770c8 100644 --- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h +++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h @@ -1,12 +1,12 @@ -MEMCPY_FN(__memcpy, +MEMCPY_FN(memcpy_orig, "x86-64-unrolled", "unrolled memcpy() in arch/x86/lib/memcpy_64.S") -MEMCPY_FN(memcpy_c, +MEMCPY_FN(__memcpy, "x86-64-movsq", "movsq-based memcpy() in arch/x86/lib/memcpy_64.S") -MEMCPY_FN(memcpy_c_e, +MEMCPY_FN(memcpy_erms, "x86-64-movsb", "movsb-based memcpy() in arch/x86/lib/memcpy_64.S") diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S index fcd9cf00600a..e4c2c30143b9 100644 --- a/tools/perf/bench/mem-memcpy-x86-64-asm.S +++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S @@ -1,8 +1,6 @@ #define memcpy MEMCPY /* don't hide glibc's memcpy() */ #define altinstr_replacement text #define globl p2align 4; .globl -#define Lmemcpy_c globl memcpy_c; memcpy_c -#define Lmemcpy_c_e globl memcpy_c_e; memcpy_c_e #include "../../../arch/x86/lib/memcpy_64.S" /* * We need to provide note.GNU-stack section, saying that we want diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h index a71dff97c1f5..f02d028771d9 100644 --- a/tools/perf/bench/mem-memset-x86-64-asm-def.h +++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h @@ -1,12 +1,12 @@ -MEMSET_FN(__memset, +MEMSET_FN(memset_orig, "x86-64-unrolled", "unrolled memset() in arch/x86/lib/memset_64.S") -MEMSET_FN(memset_c, +MEMSET_FN(__memset, "x86-64-stosq", "movsq-based memset() in arch/x86/lib/memset_64.S") -MEMSET_FN(memset_c_e, +MEMSET_FN(memset_erms, "x86-64-stosb", "movsb-based memset() in arch/x86/lib/memset_64.S") diff --git a/tools/perf/bench/mem-memset-x86-64-asm.S b/tools/perf/bench/mem-memset-x86-64-asm.S index 9e5af89ed13a..de278784c866 100644 --- a/tools/perf/bench/mem-memset-x86-64-asm.S +++ b/tools/perf/bench/mem-memset-x86-64-asm.S @@ -1,8 +1,6 @@ #define memset MEMSET /* don't hide glibc's memset() */ #define altinstr_replacement text #define globl p2align 4; .globl -#define Lmemset_c globl memset_c; memset_c -#define Lmemset_c_e globl memset_c_e; memset_c_e #include "../../../arch/x86/lib/memset_64.S" /* diff --git a/tools/perf/util/include/asm/alternative-asm.h b/tools/perf/util/include/asm/alternative-asm.h index 6789d788d494..3a3a0f16456a 100644 --- a/tools/perf/util/include/asm/alternative-asm.h +++ b/tools/perf/util/include/asm/alternative-asm.h @@ -4,5 +4,6 @@ /* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */ #define altinstruction_entry # +#define ALTERNATIVE_2 # #endif From 515e23f0193f42dab3b3e89f2c54da16b8bc3481 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 26 Feb 2015 18:51:37 +0100 Subject: [PATCH 027/148] perf/bench: Carve out mem routine benchmarking ... so that we can call it multiple times. See next patch. 
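For illustration only (a hand-written sketch with made-up names, not the perf code itself), the shape of the refactoring is one table of routines plus one helper that benchmarks a single entry, roughly:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <time.h>

    struct routine {
            const char *name;
            void *(*fn)(void *dst, const void *src, size_t len);
    };

    /* only libc memcpy here; the real table lists the arch-specific variants */
    static const struct routine routines[] = {
            { "libc-memcpy", memcpy },
            { NULL, NULL },
    };

    /* time one routine copying 'len' bytes 'iters' times and print MB/s */
    static void bench_one_routine(const struct routine *r, size_t len, int iters)
    {
            void *src = calloc(1, len);
            void *dst = calloc(1, len);
            struct timespec t0, t1;
            double secs;
            int i;

            clock_gettime(CLOCK_MONOTONIC, &t0);
            for (i = 0; i < iters; i++)
                    r->fn(dst, src, len);
            clock_gettime(CLOCK_MONOTONIC, &t1);

            secs = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
            printf("%-12s %10.1f MB/s\n", r->name,
                   (double)len * iters / secs / 1e6);

            free(src);
            free(dst);
    }

    int main(void)
    {
            const struct routine *r;

            /* running "all" routines is then just this loop; a named routine is one lookup */
            for (r = routines; r->name; r++)
                    bench_one_routine(r, 1 << 20, 512);
            return 0;
    }

With the per-routine measurement isolated like that, a run-everything mode (the "-r all" option added in the next patch) reduces to a loop over the table instead of a single lookup.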
Reviewed-by: Hitoshi Mitake Signed-off-by: Borislav Petkov --- tools/perf/bench/mem-memcpy.c | 120 ++++++++++++++++------------------ 1 file changed, 58 insertions(+), 62 deletions(-) diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c index 6c14afe8c1b1..50991c459ee1 100644 --- a/tools/perf/bench/mem-memcpy.c +++ b/tools/perf/bench/mem-memcpy.c @@ -135,84 +135,32 @@ struct bench_mem_info { const char *const *usage; }; -static int bench_mem_common(int argc, const char **argv, - const char *prefix __maybe_unused, - struct bench_mem_info *info) +static void __bench_mem_routine(struct bench_mem_info *info, int r_idx, size_t len, double totallen) { - int i; - size_t len; - double totallen; + const struct routine *r = &info->routines[r_idx]; double result_bps[2]; u64 result_cycle[2]; - argc = parse_options(argc, argv, options, - info->usage, 0); - - if (no_prefault && only_prefault) { - fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n"); - return 1; - } - - if (use_cycle) - init_cycle(); - - len = (size_t)perf_atoll((char *)length_str); - totallen = (double)len * iterations; - result_cycle[0] = result_cycle[1] = 0ULL; result_bps[0] = result_bps[1] = 0.0; - if ((s64)len <= 0) { - fprintf(stderr, "Invalid length:%s\n", length_str); - return 1; - } - - /* same to without specifying either of prefault and no-prefault */ - if (only_prefault && no_prefault) - only_prefault = no_prefault = false; - - for (i = 0; info->routines[i].name; i++) { - if (!strcmp(info->routines[i].name, routine)) - break; - } - if (!info->routines[i].name) { - printf("Unknown routine:%s\n", routine); - printf("Available routines...\n"); - for (i = 0; info->routines[i].name; i++) { - printf("\t%s ... %s\n", - info->routines[i].name, info->routines[i].desc); - } - return 1; - } - if (bench_format == BENCH_FORMAT_DEFAULT) printf("# Copying %s Bytes ...\n\n", length_str); if (!only_prefault && !no_prefault) { /* show both of results */ if (use_cycle) { - result_cycle[0] = - info->do_cycle(&info->routines[i], len, false); - result_cycle[1] = - info->do_cycle(&info->routines[i], len, true); + result_cycle[0] = info->do_cycle(r, len, false); + result_cycle[1] = info->do_cycle(r, len, true); } else { - result_bps[0] = - info->do_gettimeofday(&info->routines[i], - len, false); - result_bps[1] = - info->do_gettimeofday(&info->routines[i], - len, true); + result_bps[0] = info->do_gettimeofday(r, len, false); + result_bps[1] = info->do_gettimeofday(r, len, true); } } else { - if (use_cycle) { - result_cycle[pf] = - info->do_cycle(&info->routines[i], - len, only_prefault); - } else { - result_bps[pf] = - info->do_gettimeofday(&info->routines[i], - len, only_prefault); - } + if (use_cycle) + result_cycle[pf] = info->do_cycle(r, len, only_prefault); + else + result_bps[pf] = info->do_gettimeofday(r, len, only_prefault); } switch (bench_format) { @@ -265,6 +213,54 @@ static int bench_mem_common(int argc, const char **argv, die("unknown format: %d\n", bench_format); break; } +} + +static int bench_mem_common(int argc, const char **argv, + const char *prefix __maybe_unused, + struct bench_mem_info *info) +{ + int i; + size_t len; + double totallen; + + argc = parse_options(argc, argv, options, + info->usage, 0); + + if (no_prefault && only_prefault) { + fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n"); + return 1; + } + + if (use_cycle) + init_cycle(); + + len = (size_t)perf_atoll((char *)length_str); + totallen = (double)len * iterations; + + if ((s64)len <= 0) { + 
fprintf(stderr, "Invalid length:%s\n", length_str); + return 1; + } + + /* same to without specifying either of prefault and no-prefault */ + if (only_prefault && no_prefault) + only_prefault = no_prefault = false; + + for (i = 0; info->routines[i].name; i++) { + if (!strcmp(info->routines[i].name, routine)) + break; + } + if (!info->routines[i].name) { + printf("Unknown routine:%s\n", routine); + printf("Available routines...\n"); + for (i = 0; info->routines[i].name; i++) { + printf("\t%s ... %s\n", + info->routines[i].name, info->routines[i].desc); + } + return 1; + } + + __bench_mem_routine(info, i, len, totallen); return 0; } From dfecb95cdfeaf7872d83a96bec3a606e9cd95c8d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 26 Feb 2015 19:02:43 +0100 Subject: [PATCH 028/148] perf/bench: Add -r all so that you can run all mem* routines perf bench mem mem{set,cpy} -r all thus runs all available mem benchmarking routines. Reviewed-by: Hitoshi Mitake Signed-off-by: Borislav Petkov --- tools/perf/bench/mem-memcpy.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c index 50991c459ee1..eca3d0e49582 100644 --- a/tools/perf/bench/mem-memcpy.c +++ b/tools/perf/bench/mem-memcpy.c @@ -36,7 +36,7 @@ static const struct option options[] = { "Specify length of memory to copy. " "Available units: B, KB, MB, GB and TB (upper and lower)"), OPT_STRING('r', "routine", &routine, "default", - "Specify routine to copy"), + "Specify routine to copy, \"all\" runs all available routines"), OPT_INTEGER('i', "iterations", &iterations, "repeat memcpy() invocation this number of times"), OPT_BOOLEAN('c', "cycle", &use_cycle, @@ -144,6 +144,8 @@ static void __bench_mem_routine(struct bench_mem_info *info, int r_idx, size_t l result_cycle[0] = result_cycle[1] = 0ULL; result_bps[0] = result_bps[1] = 0.0; + printf("Routine %s (%s)\n", r->name, r->desc); + if (bench_format == BENCH_FORMAT_DEFAULT) printf("# Copying %s Bytes ...\n\n", length_str); @@ -246,6 +248,12 @@ static int bench_mem_common(int argc, const char **argv, if (only_prefault && no_prefault) only_prefault = no_prefault = false; + if (!strncmp(routine, "all", 3)) { + for (i = 0; info->routines[i].name; i++) + __bench_mem_routine(info, i, len, totallen); + return 0; + } + for (i = 0; info->routines[i].name; i++) { if (!strcmp(info->routines[i].name, routine)) break; From 29a5ff97fa0d8045d262a772c3853e3ef1ed98d8 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 3 Mar 2015 22:31:32 -0500 Subject: [PATCH 029/148] x86/compat: Remove compat_ni_syscall() compat_ni_syscall() does the same thing as sys_ni_syscall(). Signed-off-by: Brian Gerst Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1425439896-8322-2-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/Makefile | 2 +- arch/x86/ia32/nosyscall.c | 7 ------- arch/x86/ia32/syscall_ia32.c | 4 ++-- 3 files changed, 3 insertions(+), 10 deletions(-) delete mode 100644 arch/x86/ia32/nosyscall.c diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index e785b422b766..e66d85021aa2 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -3,7 +3,7 @@ # obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o -obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o +obj-$(CONFIG_IA32_EMULATION) += syscall_ia32.o obj-$(CONFIG_IA32_AOUT) += ia32_aout.o diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c deleted file mode 100644 index 51ecd5b4e787..000000000000 --- a/arch/x86/ia32/nosyscall.c +++ /dev/null @@ -1,7 +0,0 @@ -#include -#include - -long compat_ni_syscall(void) -{ - return -ENOSYS; -} diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c index 4754ba0f5d9f..3429b14793e3 100644 --- a/arch/x86/ia32/syscall_ia32.c +++ b/arch/x86/ia32/syscall_ia32.c @@ -13,13 +13,13 @@ typedef void (*sys_call_ptr_t)(void); -extern void compat_ni_syscall(void); +extern asmlinkage void sys_ni_syscall(void); const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_ia32_syscall_max] = &compat_ni_syscall, + [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, #include }; From 2aa4a710928863e84cb71e60b7c839d12403f5ca Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 3 Mar 2015 22:31:33 -0500 Subject: [PATCH 030/148] x86/compat: Merge native and compat 32-bit syscall tables Combine the 32-bit syscall tables into one file. Signed-off-by: Brian Gerst Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1425439896-8322-3-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/Makefile | 1 - arch/x86/ia32/syscall_ia32.c | 25 ------------------------- arch/x86/kernel/Makefile | 1 + arch/x86/kernel/syscall_32.c | 16 ++++++++++++---- 4 files changed, 13 insertions(+), 30 deletions(-) delete mode 100644 arch/x86/ia32/syscall_ia32.c diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index e66d85021aa2..bb635c641869 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -3,7 +3,6 @@ # obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o -obj-$(CONFIG_IA32_EMULATION) += syscall_ia32.o obj-$(CONFIG_IA32_AOUT) += ia32_aout.o diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c deleted file mode 100644 index 3429b14793e3..000000000000 --- a/arch/x86/ia32/syscall_ia32.c +++ /dev/null @@ -1,25 +0,0 @@ -/* System call table for ia32 emulation. */ - -#include -#include -#include -#include - -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ; -#include -#undef __SYSCALL_I386 - -#define __SYSCALL_I386(nr, sym, compat) [nr] = compat, - -typedef void (*sys_call_ptr_t)(void); - -extern asmlinkage void sys_ni_syscall(void); - -const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... 
__NR_ia32_syscall_max] = &sys_ni_syscall, -#include -}; diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 5d4502c8b983..62fbe7177cdd 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -28,6 +28,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += mcount_64.o obj-y += syscall_$(BITS).o vsyscall_gtod.o +obj-$(CONFIG_IA32_EMULATION) += syscall_32.o obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c index e9bcd57d8a9e..3777189c4a19 100644 --- a/arch/x86/kernel/syscall_32.c +++ b/arch/x86/kernel/syscall_32.c @@ -5,21 +5,29 @@ #include #include -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; +#ifdef CONFIG_IA32_EMULATION +#define SYM(sym, compat) compat +#else +#define SYM(sym, compat) sym +#define ia32_sys_call_table sys_call_table +#define __NR_ia32_syscall_max __NR_syscall_max +#endif + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; #include #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym, compat) [nr] = sym, +#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), typedef asmlinkage void (*sys_call_ptr_t)(void); extern asmlinkage void sys_ni_syscall(void); -__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, + [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, #include }; From 7e8e385aaf6ed5b64b5d9108081cfcdcdd021b78 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Tue, 3 Mar 2015 22:31:34 -0500 Subject: [PATCH 031/148] x86/compat: Remove sys32_vm86_warning The check against lastcomm is racy, and the message it produces isn't necessary. vm86 support can be disabled on a 32-bit kernel also, and doesn't have this message. Switch to sys_ni_syscall instead. Signed-off-by: Brian Gerst Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1425439896-8322-4-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/sys_ia32.c | 14 -------------- arch/x86/syscalls/syscall_32.tbl | 4 ++-- 2 files changed, 2 insertions(+), 16 deletions(-) diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 8e0ceecdc957..719cd702b0a4 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -201,20 +201,6 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, advice); } -long sys32_vm86_warning(void) -{ - struct task_struct *me = current; - static char lastcomm[sizeof(me->comm)]; - - if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { - compat_printk(KERN_INFO - "%s: vm86 mode not supported on 64 bit kernel\n", - me->comm); - strncpy(lastcomm, me->comm, sizeof(lastcomm)); - } - return -ENOSYS; -} - asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, size_t count) { diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index b3560ece1c9f..ef8187f9d28d 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -119,7 +119,7 @@ 110 i386 iopl sys_iopl 111 i386 vhangup sys_vhangup 112 i386 idle -113 i386 vm86old sys_vm86old sys32_vm86_warning +113 i386 vm86old sys_vm86old sys_ni_syscall 114 i386 wait4 sys_wait4 compat_sys_wait4 115 i386 swapoff sys_swapoff 116 i386 sysinfo sys_sysinfo compat_sys_sysinfo @@ -172,7 +172,7 @@ 163 i386 mremap sys_mremap 164 i386 setresuid sys_setresuid16 165 i386 getresuid sys_getresuid16 -166 i386 vm86 sys_vm86 sys32_vm86_warning +166 i386 vm86 sys_vm86 sys_ni_syscall 167 i386 query_module 168 i386 poll sys_poll 169 i386 nfsservctl From 69e8544cd0056e02965ffb5e8414fb7501a2ee2e Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 26 Feb 2015 14:40:24 -0800 Subject: [PATCH 032/148] x86/asm/64: Open-code register save/restore in trace_hardirqs*() thunks This is a preparatory patch for change in "struct pt_regs" handling in entry_64.S. trace_hardirqs*() thunks were (ab)using a part of the 'pt_regs' handling code, namely the SAVE_ARGS/RESTORE_ARGS macros, to save/restore registers across C function calls. Since SAVE_ARGS is going to be changed, open-code register saving/restoring here. Incidentally, this removes a bit of dead code: one SAVE_ARGS was used just to emit a CFI annotation, but it also generated unreachable assembly instructions. Take a page from thunk_32.S and use push/pop instructions instead of movq, they are far shorter: 1 or 2 bytes versus 5, and no need for instructions to adjust %rsp: text data bss dec hex filename 333 40 0 373 175 thunk_64_movq.o 104 40 0 144 90 thunk_64_push_pop.o [ This is ugly as sin, but we'll fix up the ugliness in the next patch. I see no point in reordering patches just to avoid an ugly intermediate state. --Andy ] Signed-off-by: Denys Vlasenko Signed-off-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Will Drewry Link: http://lkml.kernel.org/r/1420927210-19738-4-git-send-email-dvlasenk@redhat.com Link: http://lkml.kernel.org/r/4c979ad604f0f02c5ade3b3da308b53eabd5e198.1424989793.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/lib/thunk_64.S | 46 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 4 deletions(-) diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index b30b5ebd614a..8ec443a0777b 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -17,9 +17,27 @@ CFI_STARTPROC /* this one pushes 9 elems, the next one would be %rIP */ - SAVE_ARGS + pushq_cfi %rdi + CFI_REL_OFFSET rdi, 0 + pushq_cfi %rsi + CFI_REL_OFFSET rsi, 0 + pushq_cfi %rdx + CFI_REL_OFFSET rdx, 0 + pushq_cfi %rcx + CFI_REL_OFFSET rcx, 0 + pushq_cfi %rax + CFI_REL_OFFSET rax, 0 + pushq_cfi %r8 + CFI_REL_OFFSET r8, 0 + pushq_cfi %r9 + CFI_REL_OFFSET r9, 0 + pushq_cfi %r10 + CFI_REL_OFFSET r10, 0 + pushq_cfi %r11 + CFI_REL_OFFSET r11, 0 .if \put_ret_addr_in_rdi + /* 9*8(%rsp) is return addr on stack */ movq_cfi_restore 9*8, rdi .endif @@ -45,11 +63,31 @@ #endif #endif - /* SAVE_ARGS below is used only for the .cfi directives it contains. */ +#if defined(CONFIG_TRACE_IRQFLAGS) \ + || defined(CONFIG_DEBUG_LOCK_ALLOC) \ + || defined(CONFIG_PREEMPT) CFI_STARTPROC - SAVE_ARGS + CFI_ADJUST_CFA_OFFSET 9*8 restore: - RESTORE_ARGS + popq_cfi %r11 + CFI_RESTORE r11 + popq_cfi %r10 + CFI_RESTORE r10 + popq_cfi %r9 + CFI_RESTORE r9 + popq_cfi %r8 + CFI_RESTORE r8 + popq_cfi %rax + CFI_RESTORE rax + popq_cfi %rcx + CFI_RESTORE rcx + popq_cfi %rdx + CFI_RESTORE rdx + popq_cfi %rsi + CFI_RESTORE rsi + popq_cfi %rdi + CFI_RESTORE rdi ret CFI_ENDPROC _ASM_NOKPROBE(restore) +#endif From 49db46a67bec9ca9e29ece4729a876195877af50 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 26 Feb 2015 14:40:25 -0800 Subject: [PATCH 033/148] x86/asm: Introduce push/pop macros which generate CFI_REL_OFFSET and CFI_RESTORE Sequences: pushl_cfi %reg CFI_REL_OFFSET reg, 0 and: popl_cfi %reg CFI_RESTORE reg happen quite often. This patch adds macros which generate them. No assembly changes (verified with objdump -dr vmlinux.o). Signed-off-by: Denys Vlasenko Signed-off-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Will Drewry Link: http://lkml.kernel.org/r/1421017655-25561-1-git-send-email-dvlasenk@redhat.com Link: http://lkml.kernel.org/r/2202eb90f175cf45d1b2d1c64dbb5676a8ad07ad.1424989793.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/calling.h | 42 ++++++++---------------- arch/x86/include/asm/dwarf2.h | 24 ++++++++++++++ arch/x86/kernel/entry_32.S | 21 ++++-------- arch/x86/lib/atomic64_cx8_32.S | 50 ++++++++++++---------------- arch/x86/lib/checksum_32.S | 60 ++++++++++++---------------------- arch/x86/lib/msr-reg.S | 24 +++++++------- arch/x86/lib/rwsem.S | 44 ++++++++++++------------- arch/x86/lib/thunk_32.S | 18 ++++------ arch/x86/lib/thunk_64.S | 54 ++++++++++-------------------- 9 files changed, 141 insertions(+), 196 deletions(-) diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 1f1297b46f83..3c711f2ab236 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -210,37 +210,23 @@ For 32-bit we have the following conventions - kernel is built with */ .macro SAVE_ALL - pushl_cfi %eax - CFI_REL_OFFSET eax, 0 - pushl_cfi %ebp - CFI_REL_OFFSET ebp, 0 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %edx - CFI_REL_OFFSET edx, 0 - pushl_cfi %ecx - CFI_REL_OFFSET ecx, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl_cfi_reg eax + pushl_cfi_reg ebp + pushl_cfi_reg edi + pushl_cfi_reg esi + pushl_cfi_reg edx + pushl_cfi_reg ecx + pushl_cfi_reg ebx .endm .macro RESTORE_ALL - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %ecx - CFI_RESTORE ecx - popl_cfi %edx - CFI_RESTORE edx - popl_cfi %esi - CFI_RESTORE esi - popl_cfi %edi - CFI_RESTORE edi - popl_cfi %ebp - CFI_RESTORE ebp - popl_cfi %eax - CFI_RESTORE eax + popl_cfi_reg ebx + popl_cfi_reg ecx + popl_cfi_reg edx + popl_cfi_reg esi + popl_cfi_reg edi + popl_cfi_reg ebp + popl_cfi_reg eax .endm #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index f6f15986df6c..de1cdaf4d743 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -86,11 +86,23 @@ CFI_ADJUST_CFA_OFFSET 8 .endm + .macro pushq_cfi_reg reg + pushq %\reg + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET \reg, 0 + .endm + .macro popq_cfi reg popq \reg CFI_ADJUST_CFA_OFFSET -8 .endm + .macro popq_cfi_reg reg + popq %\reg + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE \reg + .endm + .macro pushfq_cfi pushfq CFI_ADJUST_CFA_OFFSET 8 @@ -116,11 +128,23 @@ CFI_ADJUST_CFA_OFFSET 4 .endm + .macro pushl_cfi_reg reg + pushl %\reg + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET \reg, 0 + .endm + .macro popl_cfi reg popl \reg CFI_ADJUST_CFA_OFFSET -4 .endm + .macro popl_cfi_reg reg + popl %\reg + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE \reg + .endm + .macro pushfl_cfi pushfl CFI_ADJUST_CFA_OFFSET 4 diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 7e0323cc0b7d..e33ba51b1069 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -1234,20 +1234,13 @@ error_code: /*CFI_REL_OFFSET es, 0*/ pushl_cfi %ds /*CFI_REL_OFFSET ds, 0*/ - pushl_cfi %eax - CFI_REL_OFFSET eax, 0 - pushl_cfi %ebp - CFI_REL_OFFSET ebp, 0 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %edx - CFI_REL_OFFSET edx, 0 - pushl_cfi %ecx - CFI_REL_OFFSET ecx, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl_cfi_reg eax + pushl_cfi_reg ebp + pushl_cfi_reg edi + pushl_cfi_reg esi + pushl_cfi_reg 
edx + pushl_cfi_reg ecx + pushl_cfi_reg ebx cld movl $(__KERNEL_PERCPU), %ecx movl %ecx, %fs diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index f5cc9eb1d51b..082a85167a5b 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -13,16 +13,6 @@ #include #include -.macro SAVE reg - pushl_cfi %\reg - CFI_REL_OFFSET \reg, 0 -.endm - -.macro RESTORE reg - popl_cfi %\reg - CFI_RESTORE \reg -.endm - .macro read64 reg movl %ebx, %eax movl %ecx, %edx @@ -67,10 +57,10 @@ ENDPROC(atomic64_xchg_cx8) .macro addsub_return func ins insc ENTRY(atomic64_\func\()_return_cx8) CFI_STARTPROC - SAVE ebp - SAVE ebx - SAVE esi - SAVE edi + pushl_cfi_reg ebp + pushl_cfi_reg ebx + pushl_cfi_reg esi + pushl_cfi_reg edi movl %eax, %esi movl %edx, %edi @@ -89,10 +79,10 @@ ENTRY(atomic64_\func\()_return_cx8) 10: movl %ebx, %eax movl %ecx, %edx - RESTORE edi - RESTORE esi - RESTORE ebx - RESTORE ebp + popl_cfi_reg edi + popl_cfi_reg esi + popl_cfi_reg ebx + popl_cfi_reg ebp ret CFI_ENDPROC ENDPROC(atomic64_\func\()_return_cx8) @@ -104,7 +94,7 @@ addsub_return sub sub sbb .macro incdec_return func ins insc ENTRY(atomic64_\func\()_return_cx8) CFI_STARTPROC - SAVE ebx + pushl_cfi_reg ebx read64 %esi 1: @@ -119,7 +109,7 @@ ENTRY(atomic64_\func\()_return_cx8) 10: movl %ebx, %eax movl %ecx, %edx - RESTORE ebx + popl_cfi_reg ebx ret CFI_ENDPROC ENDPROC(atomic64_\func\()_return_cx8) @@ -130,7 +120,7 @@ incdec_return dec sub sbb ENTRY(atomic64_dec_if_positive_cx8) CFI_STARTPROC - SAVE ebx + pushl_cfi_reg ebx read64 %esi 1: @@ -146,18 +136,18 @@ ENTRY(atomic64_dec_if_positive_cx8) 2: movl %ebx, %eax movl %ecx, %edx - RESTORE ebx + popl_cfi_reg ebx ret CFI_ENDPROC ENDPROC(atomic64_dec_if_positive_cx8) ENTRY(atomic64_add_unless_cx8) CFI_STARTPROC - SAVE ebp - SAVE ebx + pushl_cfi_reg ebp + pushl_cfi_reg ebx /* these just push these two parameters on the stack */ - SAVE edi - SAVE ecx + pushl_cfi_reg edi + pushl_cfi_reg ecx movl %eax, %ebp movl %edx, %edi @@ -179,8 +169,8 @@ ENTRY(atomic64_add_unless_cx8) 3: addl $8, %esp CFI_ADJUST_CFA_OFFSET -8 - RESTORE ebx - RESTORE ebp + popl_cfi_reg ebx + popl_cfi_reg ebp ret 4: cmpl %edx, 4(%esp) @@ -192,7 +182,7 @@ ENDPROC(atomic64_add_unless_cx8) ENTRY(atomic64_inc_not_zero_cx8) CFI_STARTPROC - SAVE ebx + pushl_cfi_reg ebx read64 %esi 1: @@ -209,7 +199,7 @@ ENTRY(atomic64_inc_not_zero_cx8) movl $1, %eax 3: - RESTORE ebx + popl_cfi_reg ebx ret CFI_ENDPROC ENDPROC(atomic64_inc_not_zero_cx8) diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index e78b8eee6615..c3b9953d3fa0 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -51,10 +51,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) */ ENTRY(csum_partial) CFI_STARTPROC - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl_cfi_reg esi + pushl_cfi_reg ebx movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: unsigned char *buff @@ -131,10 +129,8 @@ ENTRY(csum_partial) jz 8f roll $8, %eax 8: - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %esi - CFI_RESTORE esi + popl_cfi_reg ebx + popl_cfi_reg esi ret CFI_ENDPROC ENDPROC(csum_partial) @@ -145,10 +141,8 @@ ENDPROC(csum_partial) ENTRY(csum_partial) CFI_STARTPROC - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl_cfi_reg esi + pushl_cfi_reg ebx movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # 
Function arg: int len movl 12(%esp),%esi # Function arg: const unsigned char *buf @@ -255,10 +249,8 @@ ENTRY(csum_partial) jz 90f roll $8, %eax 90: - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %esi - CFI_RESTORE esi + popl_cfi_reg ebx + popl_cfi_reg esi ret CFI_ENDPROC ENDPROC(csum_partial) @@ -298,12 +290,9 @@ ENTRY(csum_partial_copy_generic) CFI_STARTPROC subl $4,%esp CFI_ADJUST_CFA_OFFSET 4 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl_cfi_reg edi + pushl_cfi_reg esi + pushl_cfi_reg ebx movl ARGBASE+16(%esp),%eax # sum movl ARGBASE+12(%esp),%ecx # len movl ARGBASE+4(%esp),%esi # src @@ -412,12 +401,9 @@ DST( movb %cl, (%edi) ) .previous - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %esi - CFI_RESTORE esi - popl_cfi %edi - CFI_RESTORE edi + popl_cfi_reg ebx + popl_cfi_reg esi + popl_cfi_reg edi popl_cfi %ecx # equivalent to addl $4,%esp ret CFI_ENDPROC @@ -441,12 +427,9 @@ ENDPROC(csum_partial_copy_generic) ENTRY(csum_partial_copy_generic) CFI_STARTPROC - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 + pushl_cfi_reg ebx + pushl_cfi_reg edi + pushl_cfi_reg esi movl ARGBASE+4(%esp),%esi #src movl ARGBASE+8(%esp),%edi #dst movl ARGBASE+12(%esp),%ecx #len @@ -506,12 +489,9 @@ DST( movb %dl, (%edi) ) jmp 7b .previous - popl_cfi %esi - CFI_RESTORE esi - popl_cfi %edi - CFI_RESTORE edi - popl_cfi %ebx - CFI_RESTORE ebx + popl_cfi_reg esi + popl_cfi_reg edi + popl_cfi_reg ebx ret CFI_ENDPROC ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index f6d13eefad10..3ca5218fbece 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -14,8 +14,8 @@ .macro op_safe_regs op ENTRY(\op\()_safe_regs) CFI_STARTPROC - pushq_cfi %rbx - pushq_cfi %rbp + pushq_cfi_reg rbx + pushq_cfi_reg rbp movq %rdi, %r10 /* Save pointer */ xorl %r11d, %r11d /* Return value */ movl (%rdi), %eax @@ -35,8 +35,8 @@ ENTRY(\op\()_safe_regs) movl %ebp, 20(%r10) movl %esi, 24(%r10) movl %edi, 28(%r10) - popq_cfi %rbp - popq_cfi %rbx + popq_cfi_reg rbp + popq_cfi_reg rbx ret 3: CFI_RESTORE_STATE @@ -53,10 +53,10 @@ ENDPROC(\op\()_safe_regs) .macro op_safe_regs op ENTRY(\op\()_safe_regs) CFI_STARTPROC - pushl_cfi %ebx - pushl_cfi %ebp - pushl_cfi %esi - pushl_cfi %edi + pushl_cfi_reg ebx + pushl_cfi_reg ebp + pushl_cfi_reg esi + pushl_cfi_reg edi pushl_cfi $0 /* Return value */ pushl_cfi %eax movl 4(%eax), %ecx @@ -80,10 +80,10 @@ ENTRY(\op\()_safe_regs) movl %esi, 24(%eax) movl %edi, 28(%eax) popl_cfi %eax - popl_cfi %edi - popl_cfi %esi - popl_cfi %ebp - popl_cfi %ebx + popl_cfi_reg edi + popl_cfi_reg esi + popl_cfi_reg ebp + popl_cfi_reg ebx ret 3: CFI_RESTORE_STATE diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index 5dff5f042468..2322abe4da3b 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S @@ -34,10 +34,10 @@ */ #define save_common_regs \ - pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0 + pushl_cfi_reg ecx #define restore_common_regs \ - popl_cfi %ecx; CFI_RESTORE ecx + popl_cfi_reg ecx /* Avoid uglifying the argument copying x86-64 needs to do. 
*/ .macro movq src, dst @@ -64,22 +64,22 @@ */ #define save_common_regs \ - pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \ - pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \ - pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \ - pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \ - pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \ - pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \ - pushq_cfi %r11; CFI_REL_OFFSET r11, 0 + pushq_cfi_reg rdi; \ + pushq_cfi_reg rsi; \ + pushq_cfi_reg rcx; \ + pushq_cfi_reg r8; \ + pushq_cfi_reg r9; \ + pushq_cfi_reg r10; \ + pushq_cfi_reg r11 #define restore_common_regs \ - popq_cfi %r11; CFI_RESTORE r11; \ - popq_cfi %r10; CFI_RESTORE r10; \ - popq_cfi %r9; CFI_RESTORE r9; \ - popq_cfi %r8; CFI_RESTORE r8; \ - popq_cfi %rcx; CFI_RESTORE rcx; \ - popq_cfi %rsi; CFI_RESTORE rsi; \ - popq_cfi %rdi; CFI_RESTORE rdi + popq_cfi_reg r11; \ + popq_cfi_reg r10; \ + popq_cfi_reg r9; \ + popq_cfi_reg r8; \ + popq_cfi_reg rcx; \ + popq_cfi_reg rsi; \ + popq_cfi_reg rdi #endif @@ -87,12 +87,10 @@ ENTRY(call_rwsem_down_read_failed) CFI_STARTPROC save_common_regs - __ASM_SIZE(push,_cfi) %__ASM_REG(dx) - CFI_REL_OFFSET __ASM_REG(dx), 0 + __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) movq %rax,%rdi call rwsem_down_read_failed - __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) - CFI_RESTORE __ASM_REG(dx) + __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) restore_common_regs ret CFI_ENDPROC @@ -124,12 +122,10 @@ ENDPROC(call_rwsem_wake) ENTRY(call_rwsem_downgrade_wake) CFI_STARTPROC save_common_regs - __ASM_SIZE(push,_cfi) %__ASM_REG(dx) - CFI_REL_OFFSET __ASM_REG(dx), 0 + __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) movq %rax,%rdi call rwsem_downgrade_wake - __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) - CFI_RESTORE __ASM_REG(dx) + __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) restore_common_regs ret CFI_ENDPROC diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S index e28cdaf5ac2c..5eb715087b80 100644 --- a/arch/x86/lib/thunk_32.S +++ b/arch/x86/lib/thunk_32.S @@ -13,12 +13,9 @@ .globl \name \name: CFI_STARTPROC - pushl_cfi %eax - CFI_REL_OFFSET eax, 0 - pushl_cfi %ecx - CFI_REL_OFFSET ecx, 0 - pushl_cfi %edx - CFI_REL_OFFSET edx, 0 + pushl_cfi_reg eax + pushl_cfi_reg ecx + pushl_cfi_reg edx .if \put_ret_addr_in_eax /* Place EIP in the arg1 */ @@ -26,12 +23,9 @@ .endif call \func - popl_cfi %edx - CFI_RESTORE edx - popl_cfi %ecx - CFI_RESTORE ecx - popl_cfi %eax - CFI_RESTORE eax + popl_cfi_reg edx + popl_cfi_reg ecx + popl_cfi_reg eax ret CFI_ENDPROC _ASM_NOKPROBE(\name) diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index 8ec443a0777b..f89ba4e93025 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -17,24 +17,15 @@ CFI_STARTPROC /* this one pushes 9 elems, the next one would be %rIP */ - pushq_cfi %rdi - CFI_REL_OFFSET rdi, 0 - pushq_cfi %rsi - CFI_REL_OFFSET rsi, 0 - pushq_cfi %rdx - CFI_REL_OFFSET rdx, 0 - pushq_cfi %rcx - CFI_REL_OFFSET rcx, 0 - pushq_cfi %rax - CFI_REL_OFFSET rax, 0 - pushq_cfi %r8 - CFI_REL_OFFSET r8, 0 - pushq_cfi %r9 - CFI_REL_OFFSET r9, 0 - pushq_cfi %r10 - CFI_REL_OFFSET r10, 0 - pushq_cfi %r11 - CFI_REL_OFFSET r11, 0 + pushq_cfi_reg rdi + pushq_cfi_reg rsi + pushq_cfi_reg rdx + pushq_cfi_reg rcx + pushq_cfi_reg rax + pushq_cfi_reg r8 + pushq_cfi_reg r9 + pushq_cfi_reg r10 + pushq_cfi_reg r11 .if \put_ret_addr_in_rdi /* 9*8(%rsp) is return addr on stack */ @@ -69,24 +60,15 @@ CFI_STARTPROC CFI_ADJUST_CFA_OFFSET 9*8 restore: - popq_cfi %r11 - CFI_RESTORE r11 - popq_cfi %r10 - CFI_RESTORE r10 - popq_cfi %r9 - CFI_RESTORE r9 - popq_cfi %r8 - CFI_RESTORE r8 - popq_cfi %rax - CFI_RESTORE rax - popq_cfi %rcx - 
CFI_RESTORE rcx - popq_cfi %rdx - CFI_RESTORE rdx - popq_cfi %rsi - CFI_RESTORE rsi - popq_cfi %rdi - CFI_RESTORE rdi + popq_cfi_reg r11 + popq_cfi_reg r10 + popq_cfi_reg r9 + popq_cfi_reg r8 + popq_cfi_reg rax + popq_cfi_reg rcx + popq_cfi_reg rdx + popq_cfi_reg rsi + popq_cfi_reg rdi ret CFI_ENDPROC _ASM_NOKPROBE(restore) From 6e1327bd2b20ccb387fcddc0caa605cb253cc458 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 26 Feb 2015 14:40:26 -0800 Subject: [PATCH 034/148] x86/asm/entry/64: Fix incorrect symbolic constant usage: R11->ARGOFFSET Since the last fix of this nature, a few more instances have crept in. Fix them up. No object code changes (constants have the same value). Signed-off-by: Denys Vlasenko Signed-off-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Will Drewry Link: http://lkml.kernel.org/r/1423778052-21038-1-git-send-email-dvlasenk@redhat.com Link: http://lkml.kernel.org/r/f5e1c4084319a42e5f14d41e2d638949ce66bc08.1424989793.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 10074ad9ebf8..a57b3387ec7d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -757,8 +757,8 @@ retint_swapgs: /* return to user-space */ * Try to use SYSRET instead of IRET if we're returning to * a completely clean 64-bit userspace context. */ - movq (RCX-R11)(%rsp), %rcx - cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */ + movq (RCX-ARGOFFSET)(%rsp), %rcx + cmpq %rcx,(RIP-ARGOFFSET)(%rsp) /* RCX == RIP */ jne opportunistic_sysret_failed /* @@ -779,7 +779,7 @@ retint_swapgs: /* return to user-space */ shr $__VIRTUAL_MASK_SHIFT, %rcx jnz opportunistic_sysret_failed - cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */ + cmpq $__USER_CS,(CS-ARGOFFSET)(%rsp) /* CS must match SYSRET */ jne opportunistic_sysret_failed movq (R11-ARGOFFSET)(%rsp), %r11 From 76f5df43cab5e765c0bd42289103e8f625813ae1 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 26 Feb 2015 14:40:27 -0800 Subject: [PATCH 035/148] x86/asm/entry/64: Always allocate a complete "struct pt_regs" on the kernel stack The 64-bit entry code was using six stack slots less by not saving/restoring registers which are callee-preserved according to the C ABI, and was not allocating space for them. Only when syscalls needed a complete "struct pt_regs" was the complete area allocated and filled in. As an additional twist, on interrupt entry a "slightly less truncated pt_regs" trick is used, to make nested interrupt stacks easier to unwind. This proved to be a source of significant obfuscation and subtle bugs. For example, 'stub_fork' had to pop the return address, extend the struct, save registers, and push return address back. Ugly. 'ia32_ptregs_common' pops return address and "returns" via jmp insn, throwing a wrench into CPU return stack cache. This patch changes the code to always allocate a complete "struct pt_regs" on the kernel stack. The saving of registers is still done lazily. "Partial pt_regs" trick on interrupt stack is retained. Macros which manipulate "struct pt_regs" on stack are reworked: - ALLOC_PT_GPREGS_ON_STACK allocates the structure. - SAVE_C_REGS saves to it those registers which are clobbered by C code. - SAVE_EXTRA_REGS saves to it all other registers. 
- Corresponding RESTORE_* and REMOVE_PT_GPREGS_FROM_STACK macros reverse it. 'ia32_ptregs_common', 'stub_fork' and friends lost their ugly dance with the return pointer. LOAD_ARGS32 in ia32entry.S now uses symbolic stack offsets instead of magic numbers. 'error_entry' and 'save_paranoid' now use SAVE_C_REGS + SAVE_EXTRA_REGS instead of having it open-coded yet again. Patch was run-tested: 64-bit executables, 32-bit executables, strace works. Timing tests did not show measurable difference in 32-bit and 64-bit syscalls. Signed-off-by: Denys Vlasenko Signed-off-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Will Drewry Link: http://lkml.kernel.org/r/1423778052-21038-2-git-send-email-dvlasenk@redhat.com Link: http://lkml.kernel.org/r/b89763d354aa23e670b9bdf3a40ae320320a7c2e.1424989793.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 47 ++--- arch/x86/include/asm/calling.h | 234 ++++++++++++------------- arch/x86/include/asm/irqflags.h | 4 +- arch/x86/include/uapi/asm/ptrace-abi.h | 1 - arch/x86/kernel/entry_64.S | 194 ++++++++------------ 5 files changed, 215 insertions(+), 265 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 156ebcab4ada..f4bed4971673 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -62,12 +62,12 @@ */ .macro LOAD_ARGS32 offset, _r9=0 .if \_r9 - movl \offset+16(%rsp),%r9d + movl \offset+R9(%rsp),%r9d .endif - movl \offset+40(%rsp),%ecx - movl \offset+48(%rsp),%edx - movl \offset+56(%rsp),%esi - movl \offset+64(%rsp),%edi + movl \offset+RCX(%rsp),%ecx + movl \offset+RDX(%rsp),%edx + movl \offset+RSI(%rsp),%esi + movl \offset+RDI(%rsp),%edi movl %eax,%eax /* zero extension */ .endm @@ -144,7 +144,8 @@ ENTRY(ia32_sysenter_target) CFI_REL_OFFSET rip,0 pushq_cfi %rax cld - SAVE_ARGS 0,1,0 + ALLOC_PT_GPREGS_ON_STACK + SAVE_C_REGS_EXCEPT_R891011 /* no need to do an access_ok check here because rbp has been 32bit zero extended */ ASM_STAC @@ -182,7 +183,8 @@ sysexit_from_sys_call: andl $~0x200,EFLAGS-ARGOFFSET(%rsp) movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */ CFI_REGISTER rip,rdx - RESTORE_ARGS 0,24,0,0,0,0 + RESTORE_RSI_RDI + REMOVE_PT_GPREGS_FROM_STACK 3*8 xorq %r8,%r8 xorq %r9,%r9 xorq %r10,%r10 @@ -256,13 +258,13 @@ sysenter_tracesys: testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jz sysenter_auditsys #endif - SAVE_REST + SAVE_EXTRA_REGS CLEAR_RREGS movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ - RESTORE_REST + RESTORE_EXTRA_REGS cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ jmp sysenter_do_call @@ -304,7 +306,8 @@ ENTRY(ia32_cstar_target) * disabled irqs and here we enable it straight after entry: */ ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_ARGS 8,0,0 + ALLOC_PT_GPREGS_ON_STACK 8 + SAVE_C_REGS_EXCEPT_RCX_R891011 movl %eax,%eax /* zero extension */ movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) @@ -341,7 +344,7 @@ cstar_dispatch: jnz sysretl_audit sysretl_from_sys_call: andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) - RESTORE_ARGS 0,-ARG_SKIP,0,0,0 + RESTORE_RSI_RDI_RDX movl RIP-ARGOFFSET(%rsp),%ecx CFI_REGISTER rip,rcx movl EFLAGS-ARGOFFSET(%rsp),%r11d @@ -372,13 +375,13 @@ 
cstar_tracesys: jz cstar_auditsys #endif xchgl %r9d,%ebp - SAVE_REST + SAVE_EXTRA_REGS CLEAR_RREGS 0, r9 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ - RESTORE_REST + RESTORE_EXTRA_REGS xchgl %ebp,%r9d cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ @@ -433,7 +436,8 @@ ENTRY(ia32_syscall) cld /* note the registers are not zero extended to the sf. this could be a problem. */ - SAVE_ARGS 0,1,0 + ALLOC_PT_GPREGS_ON_STACK + SAVE_C_REGS_EXCEPT_R891011 orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jnz ia32_tracesys @@ -446,16 +450,16 @@ ia32_sysret: movq %rax,RAX-ARGOFFSET(%rsp) ia32_ret_from_sys_call: CLEAR_RREGS -ARGOFFSET - jmp int_ret_from_sys_call + jmp int_ret_from_sys_call -ia32_tracesys: - SAVE_REST +ia32_tracesys: + SAVE_EXTRA_REGS CLEAR_RREGS movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ - RESTORE_REST + RESTORE_EXTRA_REGS cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ jmp ia32_do_call @@ -492,7 +496,6 @@ GLOBAL(stub32_clone) ALIGN ia32_ptregs_common: - popq %r11 CFI_ENDPROC CFI_STARTPROC32 simple CFI_SIGNAL_FRAME @@ -507,9 +510,9 @@ ia32_ptregs_common: /* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ CFI_REL_OFFSET rsp,RSP-ARGOFFSET /* CFI_REL_OFFSET ss,SS-ARGOFFSET*/ - SAVE_REST + SAVE_EXTRA_REGS 8 call *%rax - RESTORE_REST - jmp ia32_sysret /* misbalances the return cache */ + RESTORE_EXTRA_REGS 8 + ret CFI_ENDPROC END(ia32_ptregs_common) diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 3c711f2ab236..38356476b131 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -55,143 +55,137 @@ For 32-bit we have the following conventions - kernel is built with * for assembly code: */ -#define R15 0 -#define R14 8 -#define R13 16 -#define R12 24 -#define RBP 32 -#define RBX 40 +/* The layout forms the "struct pt_regs" on the stack: */ +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ +#define R15 0*8 +#define R14 1*8 +#define R13 2*8 +#define R12 3*8 +#define RBP 4*8 +#define RBX 5*8 +/* These regs are callee-clobbered. Always saved on kernel entry. */ +#define R11 6*8 +#define R10 7*8 +#define R9 8*8 +#define R8 9*8 +#define RAX 10*8 +#define RCX 11*8 +#define RDX 12*8 +#define RSI 13*8 +#define RDI 14*8 +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. 
+ * On hw interrupt, it's IRQ number: + */ +#define ORIG_RAX 15*8 +/* Return frame for iretq */ +#define RIP 16*8 +#define CS 17*8 +#define EFLAGS 18*8 +#define RSP 19*8 +#define SS 20*8 -/* arguments: interrupts/non tracing syscalls only save up to here: */ -#define R11 48 -#define R10 56 -#define R9 64 -#define R8 72 -#define RAX 80 -#define RCX 88 -#define RDX 96 -#define RSI 104 -#define RDI 112 -#define ORIG_RAX 120 /* + error_code */ -/* end of arguments */ - -/* cpu exception frame or undefined in case of fast syscall: */ -#define RIP 128 -#define CS 136 -#define EFLAGS 144 -#define RSP 152 -#define SS 160 - -#define ARGOFFSET R11 - - .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 - subq $9*8+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET 9*8+\addskip - movq_cfi rdi, 8*8 - movq_cfi rsi, 7*8 - movq_cfi rdx, 6*8 - - .if \save_rcx - movq_cfi rcx, 5*8 - .endif - - .if \rax_enosys - movq $-ENOSYS, 4*8(%rsp) - .else - movq_cfi rax, 4*8 - .endif - - .if \save_r891011 - movq_cfi r8, 3*8 - movq_cfi r9, 2*8 - movq_cfi r10, 1*8 - movq_cfi r11, 0*8 - .endif +#define ARGOFFSET 0 + .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 + subq $15*8+\addskip, %rsp + CFI_ADJUST_CFA_OFFSET 15*8+\addskip .endm -#define ARG_SKIP (9*8) + .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8plus=1 + .if \r8plus + movq_cfi r11, 6*8+\offset + movq_cfi r10, 7*8+\offset + movq_cfi r9, 8*8+\offset + movq_cfi r8, 9*8+\offset + .endif + .if \rax + movq_cfi rax, 10*8+\offset + .endif + .if \rcx + movq_cfi rcx, 11*8+\offset + .endif + movq_cfi rdx, 12*8+\offset + movq_cfi rsi, 13*8+\offset + movq_cfi rdi, 14*8+\offset + .endm + .macro SAVE_C_REGS offset=0 + SAVE_C_REGS_HELPER \offset, 1, 1, 1 + .endm + .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0 + SAVE_C_REGS_HELPER \offset, 0, 0, 1 + .endm + .macro SAVE_C_REGS_EXCEPT_R891011 + SAVE_C_REGS_HELPER 0, 1, 1, 0 + .endm + .macro SAVE_C_REGS_EXCEPT_RCX_R891011 + SAVE_C_REGS_HELPER 0, 1, 0, 0 + .endm - .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \ - rstor_r8910=1, rstor_rdx=1 + .macro SAVE_EXTRA_REGS offset=0 + movq_cfi r15, 0*8+\offset + movq_cfi r14, 1*8+\offset + movq_cfi r13, 2*8+\offset + movq_cfi r12, 3*8+\offset + movq_cfi rbp, 4*8+\offset + movq_cfi rbx, 5*8+\offset + .endm + .macro SAVE_EXTRA_REGS_RBP offset=0 + movq_cfi rbp, 4*8+\offset + .endm + + .macro RESTORE_EXTRA_REGS offset=0 + movq_cfi_restore 0*8+\offset, r15 + movq_cfi_restore 1*8+\offset, r14 + movq_cfi_restore 2*8+\offset, r13 + movq_cfi_restore 3*8+\offset, r12 + movq_cfi_restore 4*8+\offset, rbp + movq_cfi_restore 5*8+\offset, rbx + .endm + + .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 .if \rstor_r11 - movq_cfi_restore 0*8, r11 + movq_cfi_restore 6*8, r11 .endif - .if \rstor_r8910 - movq_cfi_restore 1*8, r10 - movq_cfi_restore 2*8, r9 - movq_cfi_restore 3*8, r8 + movq_cfi_restore 7*8, r10 + movq_cfi_restore 8*8, r9 + movq_cfi_restore 9*8, r8 .endif - .if \rstor_rax - movq_cfi_restore 4*8, rax + movq_cfi_restore 10*8, rax .endif - .if \rstor_rcx - movq_cfi_restore 5*8, rcx + movq_cfi_restore 11*8, rcx .endif - .if \rstor_rdx - movq_cfi_restore 6*8, rdx - .endif - - movq_cfi_restore 7*8, rsi - movq_cfi_restore 8*8, rdi - - .if ARG_SKIP+\addskip > 0 - addq $ARG_SKIP+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip) + movq_cfi_restore 12*8, rdx .endif + movq_cfi_restore 13*8, rsi + movq_cfi_restore 14*8, rdi + .endm + .macro RESTORE_C_REGS + RESTORE_C_REGS_HELPER 1,1,1,1,1 + .endm + .macro RESTORE_C_REGS_EXCEPT_RAX + 
RESTORE_C_REGS_HELPER 0,1,1,1,1 + .endm + .macro RESTORE_C_REGS_EXCEPT_RCX + RESTORE_C_REGS_HELPER 1,0,1,1,1 + .endm + .macro RESTORE_RSI_RDI + RESTORE_C_REGS_HELPER 0,0,0,0,0 + .endm + .macro RESTORE_RSI_RDI_RDX + RESTORE_C_REGS_HELPER 0,0,0,0,1 .endm - .macro LOAD_ARGS offset, skiprax=0 - movq \offset(%rsp), %r11 - movq \offset+8(%rsp), %r10 - movq \offset+16(%rsp), %r9 - movq \offset+24(%rsp), %r8 - movq \offset+40(%rsp), %rcx - movq \offset+48(%rsp), %rdx - movq \offset+56(%rsp), %rsi - movq \offset+64(%rsp), %rdi - .if \skiprax - .else - movq \offset+72(%rsp), %rax - .endif - .endm - -#define REST_SKIP (6*8) - - .macro SAVE_REST - subq $REST_SKIP, %rsp - CFI_ADJUST_CFA_OFFSET REST_SKIP - movq_cfi rbx, 5*8 - movq_cfi rbp, 4*8 - movq_cfi r12, 3*8 - movq_cfi r13, 2*8 - movq_cfi r14, 1*8 - movq_cfi r15, 0*8 - .endm - - .macro RESTORE_REST - movq_cfi_restore 0*8, r15 - movq_cfi_restore 1*8, r14 - movq_cfi_restore 2*8, r13 - movq_cfi_restore 3*8, r12 - movq_cfi_restore 4*8, rbp - movq_cfi_restore 5*8, rbx - addq $REST_SKIP, %rsp - CFI_ADJUST_CFA_OFFSET -(REST_SKIP) - .endm - - .macro SAVE_ALL - SAVE_ARGS - SAVE_REST - .endm - - .macro RESTORE_ALL addskip=0 - RESTORE_REST - RESTORE_ARGS 1, \addskip + .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 + addq $15*8+\addskip, %rsp + CFI_ADJUST_CFA_OFFSET -(15*8+\addskip) .endm .macro icebp diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 0a8b519226b8..021bee9b86b6 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -171,9 +171,9 @@ static inline int arch_irqs_disabled(void) #define ARCH_LOCKDEP_SYS_EXIT_IRQ \ TRACE_IRQS_ON; \ sti; \ - SAVE_REST; \ + SAVE_EXTRA_REGS; \ LOCKDEP_SYS_EXIT; \ - RESTORE_REST; \ + RESTORE_EXTRA_REGS; \ cli; \ TRACE_IRQS_OFF; diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h index 7b0a55a88851..ad115bf779f3 100644 --- a/arch/x86/include/uapi/asm/ptrace-abi.h +++ b/arch/x86/include/uapi/asm/ptrace-abi.h @@ -49,7 +49,6 @@ #define EFLAGS 144 #define RSP 152 #define SS 160 -#define ARGOFFSET R11 #endif /* __ASSEMBLY__ */ /* top of stack page */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a57b3387ec7d..e8372e08f8c2 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -26,12 +26,6 @@ * Some macro usage: * - CFI macros are used to generate dwarf2 unwind information for better * backtraces. They don't change any code. - * - SAVE_ALL/RESTORE_ALL - Save/restore all registers - * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. - * There are unfortunately lots of special cases where some registers - * not touched. The macro is a big mess that should be cleaned up. - * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. - * Gives a full stack frame. * - ENTRY/END Define functions in the symbol table. * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack * frame that is otherwise undefined after a SYSCALL @@ -190,9 +184,9 @@ ENDPROC(native_usergs_sysret64) .endm /* - * frame that enables calling into C. + * frame that enables passing a complete pt_regs to a C function. 
*/ - .macro PARTIAL_FRAME start=1 offset=0 + .macro DEFAULT_FRAME start=1 offset=0 XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET @@ -203,13 +197,6 @@ ENDPROC(native_usergs_sysret64) CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET - .endm - -/* - * frame that enables passing a complete pt_regs to a C function. - */ - .macro DEFAULT_FRAME start=1 offset=0 - PARTIAL_FRAME \start, R11+\offset-R15 CFI_REL_OFFSET rbx, RBX+\offset CFI_REL_OFFSET rbp, RBP+\offset CFI_REL_OFFSET r12, R12+\offset @@ -221,21 +208,8 @@ ENDPROC(native_usergs_sysret64) ENTRY(save_paranoid) XCPT_FRAME 1 RDI+8 cld - movq %rdi, RDI+8(%rsp) - movq %rsi, RSI+8(%rsp) - movq_cfi rdx, RDX+8 - movq_cfi rcx, RCX+8 - movq_cfi rax, RAX+8 - movq %r8, R8+8(%rsp) - movq %r9, R9+8(%rsp) - movq %r10, R10+8(%rsp) - movq %r11, R11+8(%rsp) - movq_cfi rbx, RBX+8 - movq %rbp, RBP+8(%rsp) - movq %r12, R12+8(%rsp) - movq %r13, R13+8(%rsp) - movq %r14, R14+8(%rsp) - movq %r15, R15+8(%rsp) + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 movl $1,%ebx movl $MSR_GS_BASE,%ecx rdmsr @@ -264,7 +238,7 @@ ENTRY(ret_from_fork) GET_THREAD_INFO(%rcx) - RESTORE_REST + RESTORE_EXTRA_REGS testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? jz 1f @@ -276,12 +250,10 @@ ENTRY(ret_from_fork) jmp ret_from_sys_call # go to the SYSRET fastpath 1: - subq $REST_SKIP, %rsp # leave space for volatiles - CFI_ADJUST_CFA_OFFSET REST_SKIP movq %rbp, %rdi call *%rbx movl $0, RAX(%rsp) - RESTORE_REST + RESTORE_EXTRA_REGS jmp int_ret_from_sys_call CFI_ENDPROC END(ret_from_fork) @@ -339,9 +311,11 @@ GLOBAL(system_call_after_swapgs) * and short: */ ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_ARGS 8, 0, rax_enosys=1 + ALLOC_PT_GPREGS_ON_STACK 8 + SAVE_C_REGS_EXCEPT_RAX_RCX + movq $-ENOSYS,RAX-ARGOFFSET(%rsp) movq_cfi rax,(ORIG_RAX-ARGOFFSET) - movq %rcx,RIP-ARGOFFSET(%rsp) + movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jnz tracesys @@ -372,9 +346,9 @@ ret_from_sys_call: * sysretq will re-enable interrupts: */ TRACE_IRQS_ON + RESTORE_C_REGS_EXCEPT_RCX movq RIP-ARGOFFSET(%rsp),%rcx CFI_REGISTER rip,rcx - RESTORE_ARGS 1,-ARG_SKIP,0 /*CFI_REGISTER rflags,r11*/ movq PER_CPU_VAR(old_rsp), %rsp USERGS_SYSRET64 @@ -387,16 +361,17 @@ int_ret_from_sys_call_fixup: /* Do syscall tracing */ tracesys: - leaq -REST_SKIP(%rsp), %rdi + movq %rsp, %rdi movq $AUDIT_ARCH_X86_64, %rsi call syscall_trace_enter_phase1 test %rax, %rax jnz tracesys_phase2 /* if needed, run the slow path */ - LOAD_ARGS 0 /* else restore clobbered regs */ + RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ + movq ORIG_RAX-ARGOFFSET(%rsp), %rax jmp system_call_fastpath /* and return to the fast path */ tracesys_phase2: - SAVE_REST + SAVE_EXTRA_REGS FIXUP_TOP_OF_STACK %rdi movq %rsp, %rdi movq $AUDIT_ARCH_X86_64, %rsi @@ -408,8 +383,8 @@ tracesys_phase2: * We don't reload %rax because syscall_trace_entry_phase2() returned * the value it wants us to use in the table lookup. 
*/ - LOAD_ARGS ARGOFFSET, 1 - RESTORE_REST + RESTORE_C_REGS_EXCEPT_RAX + RESTORE_EXTRA_REGS #if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max,%rax #else @@ -460,7 +435,7 @@ int_very_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) int_check_syscall_exit_work: - SAVE_REST + SAVE_EXTRA_REGS /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal @@ -479,7 +454,7 @@ int_signal: call do_notify_resume 1: movl $_TIF_WORK_MASK,%edi int_restore_rest: - RESTORE_REST + RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check @@ -489,15 +464,12 @@ END(system_call) .macro FORK_LIKE func ENTRY(stub_\func) CFI_STARTPROC - popq %r11 /* save return address */ - PARTIAL_FRAME 0 - SAVE_REST - pushq %r11 /* put it back on stack */ + DEFAULT_FRAME 0, 8 /* offset 8: return address */ + SAVE_EXTRA_REGS 8 FIXUP_TOP_OF_STACK %r11, 8 - DEFAULT_FRAME 0 8 /* offset 8: return address */ call sys_\func RESTORE_TOP_OF_STACK %r11, 8 - ret $REST_SKIP /* pop extended registers */ + ret CFI_ENDPROC END(stub_\func) .endm @@ -505,7 +477,7 @@ END(stub_\func) .macro FIXED_FRAME label,func ENTRY(\label) CFI_STARTPROC - PARTIAL_FRAME 0 8 /* offset 8: return address */ + DEFAULT_FRAME 0, 8 /* offset 8: return address */ FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET call \func RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET @@ -522,12 +494,12 @@ END(\label) ENTRY(stub_execve) CFI_STARTPROC addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST + DEFAULT_FRAME 0 + SAVE_EXTRA_REGS FIXUP_TOP_OF_STACK %r11 call sys_execve movq %rax,RAX(%rsp) - RESTORE_REST + RESTORE_EXTRA_REGS jmp int_ret_from_sys_call CFI_ENDPROC END(stub_execve) @@ -535,13 +507,13 @@ END(stub_execve) ENTRY(stub_execveat) CFI_STARTPROC addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST + DEFAULT_FRAME 0 + SAVE_EXTRA_REGS FIXUP_TOP_OF_STACK %r11 call sys_execveat RESTORE_TOP_OF_STACK %r11 movq %rax,RAX(%rsp) - RESTORE_REST + RESTORE_EXTRA_REGS jmp int_ret_from_sys_call CFI_ENDPROC END(stub_execveat) @@ -553,12 +525,12 @@ END(stub_execveat) ENTRY(stub_rt_sigreturn) CFI_STARTPROC addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST + DEFAULT_FRAME 0 + SAVE_EXTRA_REGS FIXUP_TOP_OF_STACK %r11 call sys_rt_sigreturn movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer - RESTORE_REST + RESTORE_EXTRA_REGS jmp int_ret_from_sys_call CFI_ENDPROC END(stub_rt_sigreturn) @@ -567,12 +539,12 @@ END(stub_rt_sigreturn) ENTRY(stub_x32_rt_sigreturn) CFI_STARTPROC addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST + DEFAULT_FRAME 0 + SAVE_EXTRA_REGS FIXUP_TOP_OF_STACK %r11 call sys32_x32_rt_sigreturn movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer - RESTORE_REST + RESTORE_EXTRA_REGS jmp int_ret_from_sys_call CFI_ENDPROC END(stub_x32_rt_sigreturn) @@ -580,13 +552,13 @@ END(stub_x32_rt_sigreturn) ENTRY(stub_x32_execve) CFI_STARTPROC addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST + DEFAULT_FRAME 0 + SAVE_EXTRA_REGS FIXUP_TOP_OF_STACK %r11 call compat_sys_execve RESTORE_TOP_OF_STACK %r11 movq %rax,RAX(%rsp) - RESTORE_REST + RESTORE_EXTRA_REGS jmp int_ret_from_sys_call CFI_ENDPROC END(stub_x32_execve) @@ -594,13 +566,13 @@ END(stub_x32_execve) ENTRY(stub_x32_execveat) CFI_STARTPROC addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST + DEFAULT_FRAME 0 + SAVE_EXTRA_REGS FIXUP_TOP_OF_STACK %r11 call compat_sys_execveat RESTORE_TOP_OF_STACK %r11 movq %rax,RAX(%rsp) - RESTORE_REST + RESTORE_EXTRA_REGS jmp int_ret_from_sys_call CFI_ENDPROC END(stub_x32_execveat) @@ -656,42 +628,28 @@ END(interrupt) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func - /* reserve 
pt_regs for scratch regs and rbp */ - subq $ORIG_RAX-RBP, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP cld - /* start from rbp in pt_regs and jump over */ - movq_cfi rdi, (RDI-RBP) - movq_cfi rsi, (RSI-RBP) - movq_cfi rdx, (RDX-RBP) - movq_cfi rcx, (RCX-RBP) - movq_cfi rax, (RAX-RBP) - movq_cfi r8, (R8-RBP) - movq_cfi r9, (R9-RBP) - movq_cfi r10, (R10-RBP) - movq_cfi r11, (R11-RBP) + ALLOC_PT_GPREGS_ON_STACK -RBP + SAVE_C_REGS -RBP + /* this goes to 0(%rsp) for unwinder, not for saving the value: */ + SAVE_EXTRA_REGS_RBP -RBP - /* Save rbp so that we can unwind from get_irq_regs() */ - movq_cfi rbp, 0 + leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ - /* Save previous stack value */ - movq %rsp, %rsi - - leaq -RBP(%rsp),%rdi /* arg1 for handler */ - testl $3, CS-RBP(%rsi) + testl $3, CS-RBP(%rsp) je 1f SWAPGS +1: /* * irq_count is used to check if a CPU is already on an interrupt stack * or not. While this is essentially redundant with preempt_count it is * a little cheaper to use a separate counter in the PDA (short of * moving irq_enter into assembly, which would be too much work) */ -1: incl PER_CPU_VAR(irq_count) + movq %rsp, %rsi + incl PER_CPU_VAR(irq_count) cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp CFI_DEF_CFA_REGISTER rsi - - /* Store previous stack value */ pushq %rsi CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ 0x77 /* DW_OP_breg7 */, 0, \ @@ -800,7 +758,8 @@ retint_swapgs: /* return to user-space */ */ irq_return_via_sysret: CFI_REMEMBER_STATE - RESTORE_ARGS 1,8,1 + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 movq (RSP-RIP)(%rsp),%rsp USERGS_SYSRET64 CFI_RESTORE_STATE @@ -816,7 +775,8 @@ retint_restore_args: /* return to kernel space */ */ TRACE_IRQS_IRETQ restore_args: - RESTORE_ARGS 1,8,1 + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 irq_return: INTERRUPT_RETURN @@ -887,12 +847,12 @@ retint_signal: jz retint_swapgs TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_REST + SAVE_EXTRA_REGS movq $-1,ORIG_RAX(%rsp) xorl %esi,%esi # oldset movq %rsp,%rdi # &pt_regs call do_notify_resume - RESTORE_REST + RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) @@ -1019,8 +979,7 @@ ENTRY(\sym) pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ .endif - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + ALLOC_PT_GPREGS_ON_STACK .if \paranoid .if \paranoid == 1 @@ -1269,7 +1228,9 @@ ENTRY(xen_failsafe_callback) addq $0x30,%rsp CFI_ADJUST_CFA_OFFSET -0x30 pushq_cfi $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL + ALLOC_PT_GPREGS_ON_STACK + SAVE_C_REGS + SAVE_EXTRA_REGS jmp error_exit CFI_ENDPROC END(xen_failsafe_callback) @@ -1321,11 +1282,15 @@ ENTRY(paranoid_exit) jnz paranoid_restore TRACE_IRQS_IRETQ 0 SWAPGS_UNSAFE_STACK - RESTORE_ALL 8 + RESTORE_EXTRA_REGS + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 INTERRUPT_RETURN paranoid_restore: TRACE_IRQS_IRETQ_DEBUG 0 - RESTORE_ALL 8 + RESTORE_EXTRA_REGS + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 INTERRUPT_RETURN CFI_ENDPROC END(paranoid_exit) @@ -1339,21 +1304,8 @@ ENTRY(error_entry) CFI_ADJUST_CFA_OFFSET 15*8 /* oldrax contains error code */ cld - movq %rdi, RDI+8(%rsp) - movq %rsi, RSI+8(%rsp) - movq %rdx, RDX+8(%rsp) - movq %rcx, RCX+8(%rsp) - movq %rax, RAX+8(%rsp) - movq %r8, R8+8(%rsp) - movq %r9, R9+8(%rsp) - movq %r10, R10+8(%rsp) - movq %r11, R11+8(%rsp) - movq_cfi rbx, RBX+8 - movq %rbp, RBP+8(%rsp) - movq %r12, R12+8(%rsp) - movq %r13, R13+8(%rsp) - movq %r14, R14+8(%rsp) - movq %r15, R15+8(%rsp) + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 xorl 
%ebx,%ebx testl $3,CS+8(%rsp) je error_kernelspace @@ -1402,7 +1354,7 @@ END(error_entry) ENTRY(error_exit) DEFAULT_FRAME movl %ebx,%eax - RESTORE_REST + RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) @@ -1621,8 +1573,8 @@ end_repeat_nmi: * so that we repeat another NMI. */ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + ALLOC_PT_GPREGS_ON_STACK + /* * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit * as we should not be calling schedule in NMI context. @@ -1661,8 +1613,10 @@ end_repeat_nmi: nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: + RESTORE_EXTRA_REGS + RESTORE_C_REGS /* Pop the extra iret frame at once */ - RESTORE_ALL 6*8 + REMOVE_PT_GPREGS_FROM_STACK 6*8 /* Clear the NMI executing stack variable */ movq $0, 5*8(%rsp) From e90e147cbc0cbc8dcf48000e15190badf75250f4 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 26 Feb 2015 14:40:28 -0800 Subject: [PATCH 036/148] x86/asm/entry/64: Fix comments - Misleading and slightly incorrect comments in "struct pt_regs" are fixed (four instances). - Fix incorrect comment atop EMPTY_FRAME macro. - Explain in more detail what we do with stack layout during hw interrupt. - Correct comments about "partial stack frame" which are no longer true. Signed-off-by: Denys Vlasenko Signed-off-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Will Drewry Link: http://lkml.kernel.org/r/1423778052-21038-3-git-send-email-dvlasenk@redhat.com Link: http://lkml.kernel.org/r/e1f4429c491fe6ceeddb879dea2786e0f8920f9c.1424989793.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ptrace.h | 13 ++++++++++--- arch/x86/include/uapi/asm/ptrace-abi.h | 15 +++++++++++---- arch/x86/include/uapi/asm/ptrace.h | 13 ++++++++++--- arch/x86/kernel/entry_64.S | 18 ++++++++++++------ 4 files changed, 43 insertions(+), 16 deletions(-) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 86fc2bb82287..4077d963a1a0 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -31,13 +31,17 @@ struct pt_regs { #else /* __i386__ */ struct pt_regs { +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long bp; unsigned long bx; -/* arguments: non interrupts/non tracing syscalls only save up to here*/ +/* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; @@ -47,9 +51,12 @@ struct pt_regs { unsigned long dx; unsigned long si; unsigned long di; +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. + * On hw interrupt, it's IRQ number: + */ unsigned long orig_ax; -/* end of arguments */ -/* cpu exception frame or undefined */ +/* Return frame for iretq */ unsigned long ip; unsigned long cs; unsigned long flags; diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h index ad115bf779f3..580aee3072e0 100644 --- a/arch/x86/include/uapi/asm/ptrace-abi.h +++ b/arch/x86/include/uapi/asm/ptrace-abi.h @@ -25,13 +25,17 @@ #else /* __i386__ */ #if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) +/* + * C ABI says these regs are callee-preserved. 
They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ #define R15 0 #define R14 8 #define R13 16 #define R12 24 #define RBP 32 #define RBX 40 -/* arguments: interrupts/non tracing syscalls only save up to here*/ +/* These regs are callee-clobbered. Always saved on kernel entry. */ #define R11 48 #define R10 56 #define R9 64 @@ -41,9 +45,12 @@ #define RDX 96 #define RSI 104 #define RDI 112 -#define ORIG_RAX 120 /* = ERROR */ -/* end of arguments */ -/* cpu exception frame or undefined in case of fast syscall. */ +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. + * On hw interrupt, it's IRQ number: + */ +#define ORIG_RAX 120 +/* Return frame for iretq */ #define RIP 128 #define CS 136 #define EFLAGS 144 diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h index ac4b9aa4d999..bc16115af39b 100644 --- a/arch/x86/include/uapi/asm/ptrace.h +++ b/arch/x86/include/uapi/asm/ptrace.h @@ -41,13 +41,17 @@ struct pt_regs { #ifndef __KERNEL__ struct pt_regs { +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long rbp; unsigned long rbx; -/* arguments: non interrupts/non tracing syscalls only save up to here*/ +/* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; @@ -57,9 +61,12 @@ struct pt_regs { unsigned long rdx; unsigned long rsi; unsigned long rdi; +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. + * On hw interrupt, it's IRQ number: + */ unsigned long orig_rax; -/* end of arguments */ -/* cpu exception frame or undefined */ +/* Return frame for iretq */ unsigned long rip; unsigned long cs; unsigned long eflags; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e8372e08f8c2..695f4d434a84 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -14,9 +14,6 @@ * NOTE: This code handles signal-recognition, which happens every time * after an interrupt and after each system call. * - * Normal syscalls and interrupts don't save a full stack frame, this is - * only done for syscall tracing, signals or fork/exec et.al. - * * A note on terminology: * - top of stack: Architecture defined interrupt frame from SS to RIP * at the top of the kernel process stack. @@ -151,7 +148,7 @@ ENDPROC(native_usergs_sysret64) .endm /* - * initial frame state for interrupts (and exceptions without error code) + * empty frame */ .macro EMPTY_FRAME start=1 offset=0 .if \start @@ -379,7 +376,7 @@ tracesys_phase2: call syscall_trace_enter_phase2 /* - * Reload arg registers from stack in case ptrace changed them. + * Reload registers from stack in case ptrace changed them. * We don't reload %rax because syscall_trace_entry_phase2() returned * the value it wants us to use in the table lookup. */ @@ -629,6 +626,13 @@ END(interrupt) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func cld + /* + * Since nothing in interrupt handling code touches r12...r15 members + * of "struct pt_regs", and since interrupts can nest, we can save + * four stack slots and simultaneously provide + * an unwind-friendly stack layout by saving "truncated" pt_regs + * exactly up to rbp slot, without these members. 
+ */ ALLOC_PT_GPREGS_ON_STACK -RBP SAVE_C_REGS -RBP /* this goes to 0(%rsp) for unwinder, not for saving the value: */ @@ -641,6 +645,7 @@ END(interrupt) SWAPGS 1: /* + * Save previous stack pointer, optionally switch to interrupt stack. * irq_count is used to check if a CPU is already on an interrupt stack * or not. While this is essentially redundant with preempt_count it is * a little cheaper to use a separate counter in the PDA (short of @@ -681,6 +686,7 @@ ret_from_intr: /* Restore saved previous stack */ popq %rsi CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ + /* return code expects complete pt_regs - adjust rsp accordingly: */ leaq ARGOFFSET-RBP(%rsi), %rsp CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET @@ -692,7 +698,7 @@ exit_intr: /* Interrupt came from user space */ /* - * Has a correct top of stack, but a partial stack frame + * Has a correct top of stack. * %rcx: thread info. Interrupts off. */ retint_with_reschedule: From 0d55083698ed2f498e5682c5c252e6b7224890be Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 26 Feb 2015 14:40:29 -0800 Subject: [PATCH 037/148] x86/asm/entry/64: Shrink code in 'paranoid_exit' RESTORE_EXTRA_REGS + RESTORE_C_REGS looks small, but it's a lot of instructions (fourteen). Let's reuse them. Signed-off-by: Denys Vlasenko [ Cleaned up the labels. ] Signed-off-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Will Drewry Link: http://lkml.kernel.org/r/1421272101-16847-2-git-send-email-dvlasenk@redhat.com Link: http://lkml.kernel.org/r/59d71848cee3ec9eb48c0252e602efd6bd560e3c.1424989793.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 695f4d434a84..8fafed9f462d 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1285,15 +1285,13 @@ ENTRY(paranoid_exit) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF_DEBUG testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_restore + jnz paranoid_exit_no_swapgs TRACE_IRQS_IRETQ 0 SWAPGS_UNSAFE_STACK - RESTORE_EXTRA_REGS - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 - INTERRUPT_RETURN -paranoid_restore: + jmp paranoid_exit_restore +paranoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG 0 +paranoid_exit_restore: RESTORE_EXTRA_REGS RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 From f2db9382c1140914cfdef224ce907e443c9f9b81 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 26 Feb 2015 14:40:30 -0800 Subject: [PATCH 038/148] x86/asm/entry: Do mass removal of 'ARGOFFSET' ARGOFFSET is zero now, removing it changes no code. A few macros lost "offset" parameter, since it is always zero now too. No code changes - verified with objdump. Signed-off-by: Denys Vlasenko Signed-off-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/8689f937622d9d2db0ab8be82331fa15e4ed4713.1424989793.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 142 ++++++++++++++++----------------- arch/x86/include/asm/calling.h | 2 - arch/x86/kernel/entry_64.S | 86 ++++++++++---------- 3 files changed, 114 insertions(+), 116 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index f4bed4971673..e99f8a5be2df 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -41,13 +41,13 @@ movl %edx,%edx /* zero extension */ .endm - /* clobbers %eax */ - .macro CLEAR_RREGS offset=0, _r9=rax + /* clobbers %rax */ + .macro CLEAR_RREGS _r9=rax xorl %eax,%eax - movq %rax,\offset+R11(%rsp) - movq %rax,\offset+R10(%rsp) - movq %\_r9,\offset+R9(%rsp) - movq %rax,\offset+R8(%rsp) + movq %rax,R11(%rsp) + movq %rax,R10(%rsp) + movq %\_r9,R9(%rsp) + movq %rax,R8(%rsp) .endm /* @@ -60,14 +60,14 @@ * If it's -1 to make us punt the syscall, then (u32)-1 is still * an appropriately invalid value. */ - .macro LOAD_ARGS32 offset, _r9=0 + .macro LOAD_ARGS32 _r9=0 .if \_r9 - movl \offset+R9(%rsp),%r9d + movl R9(%rsp),%r9d .endif - movl \offset+RCX(%rsp),%ecx - movl \offset+RDX(%rsp),%edx - movl \offset+RSI(%rsp),%esi - movl \offset+RDI(%rsp),%edi + movl RCX(%rsp),%ecx + movl RDX(%rsp),%edx + movl RSI(%rsp),%esi + movl RDI(%rsp),%edi movl %eax,%eax /* zero extension */ .endm @@ -158,12 +158,12 @@ ENTRY(ia32_sysenter_target) * ourselves. To save a few cycles, we can check whether * NT was set instead of doing an unconditional popfq. */ - testl $X86_EFLAGS_NT,EFLAGS-ARGOFFSET(%rsp) + testl $X86_EFLAGS_NT,EFLAGS(%rsp) jnz sysenter_fix_flags sysenter_flags_fixed: - orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP) CFI_REMEMBER_STATE jnz sysenter_tracesys cmpq $(IA32_NR_syscalls-1),%rax @@ -172,16 +172,16 @@ sysenter_do_call: IA32_ARG_FIXUP sysenter_dispatch: call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) + movq %rax,RAX(%rsp) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP) jnz sysexit_audit sysexit_from_sys_call: - andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) + andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP) /* clear IF, that popfq doesn't enable interrupts early */ - andl $~0x200,EFLAGS-ARGOFFSET(%rsp) - movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */ + andl $~0x200,EFLAGS(%rsp) + movl RIP(%rsp),%edx /* User %eip */ CFI_REGISTER rip,rdx RESTORE_RSI_RDI REMOVE_PT_GPREGS_FROM_STACK 3*8 @@ -207,18 +207,18 @@ sysexit_from_sys_call: movl %ebx,%esi /* 2nd arg: 1st syscall arg */ movl %eax,%edi /* 1st arg: syscall number */ call __audit_syscall_entry - movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ + movl RAX(%rsp),%eax /* reload syscall number */ cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys movl %ebx,%edi /* reload 1st syscall arg */ - movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */ - movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */ - movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */ - movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */ + movl RCX(%rsp),%esi /* reload 2nd syscall arg */ 
+ movl RDX(%rsp),%edx /* reload 3rd syscall arg */ + movl RSI(%rsp),%ecx /* reload 4th syscall arg */ + movl RDI(%rsp),%r8d /* reload 5th syscall arg */ .endm .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP) jnz ia32_ret_from_sys_call TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) @@ -229,13 +229,13 @@ sysexit_from_sys_call: 1: setbe %al /* 1 if error, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ call __audit_syscall_exit - movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ + movq RAX(%rsp),%rax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl %edi,TI_flags+THREAD_INFO(%rsp,RIP) jz \exit - CLEAR_RREGS -ARGOFFSET + CLEAR_RREGS jmp int_with_check .endm @@ -255,7 +255,7 @@ sysenter_fix_flags: sysenter_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP) jz sysenter_auditsys #endif SAVE_EXTRA_REGS @@ -263,7 +263,7 @@ sysenter_tracesys: movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ + LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ RESTORE_EXTRA_REGS cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ @@ -309,17 +309,17 @@ ENTRY(ia32_cstar_target) ALLOC_PT_GPREGS_ON_STACK 8 SAVE_C_REGS_EXCEPT_RCX_R891011 movl %eax,%eax /* zero extension */ - movq %rax,ORIG_RAX-ARGOFFSET(%rsp) - movq %rcx,RIP-ARGOFFSET(%rsp) - CFI_REL_OFFSET rip,RIP-ARGOFFSET - movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ + movq %rax,ORIG_RAX(%rsp) + movq %rcx,RIP(%rsp) + CFI_REL_OFFSET rip,RIP + movq %rbp,RCX(%rsp) /* this lies slightly to ptrace */ movl %ebp,%ecx - movq $__USER32_CS,CS-ARGOFFSET(%rsp) - movq $__USER32_DS,SS-ARGOFFSET(%rsp) - movq %r11,EFLAGS-ARGOFFSET(%rsp) - /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ - movq %r8,RSP-ARGOFFSET(%rsp) - CFI_REL_OFFSET rsp,RSP-ARGOFFSET + movq $__USER32_CS,CS(%rsp) + movq $__USER32_DS,SS(%rsp) + movq %r11,EFLAGS(%rsp) + /*CFI_REL_OFFSET rflags,EFLAGS*/ + movq %r8,RSP(%rsp) + CFI_REL_OFFSET rsp,RSP /* no need to do an access_ok check here because r8 has been 32bit zero extended */ /* hardware stack frame is complete now */ @@ -327,8 +327,8 @@ ENTRY(ia32_cstar_target) 1: movl (%r8),%r9d _ASM_EXTABLE(1b,ia32_badarg) ASM_CLAC - orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP) CFI_REMEMBER_STATE jnz cstar_tracesys cmpq $IA32_NR_syscalls-1,%rax @@ -337,32 +337,32 @@ cstar_do_call: IA32_ARG_FIXUP 1 cstar_dispatch: call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) + movq %rax,RAX(%rsp) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP) jnz sysretl_audit sysretl_from_sys_call: - andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) 
+ andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP) RESTORE_RSI_RDI_RDX - movl RIP-ARGOFFSET(%rsp),%ecx + movl RIP(%rsp),%ecx CFI_REGISTER rip,rcx - movl EFLAGS-ARGOFFSET(%rsp),%r11d + movl EFLAGS(%rsp),%r11d /*CFI_REGISTER rflags,r11*/ xorq %r10,%r10 xorq %r9,%r9 xorq %r8,%r8 TRACE_IRQS_ON - movl RSP-ARGOFFSET(%rsp),%esp + movl RSP(%rsp),%esp CFI_RESTORE rsp USERGS_SYSRET32 #ifdef CONFIG_AUDITSYSCALL cstar_auditsys: CFI_RESTORE_STATE - movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */ + movl %r9d,R9(%rsp) /* register to be clobbered by call */ auditsys_entry_common - movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */ + movl R9(%rsp),%r9d /* reload 6th syscall arg */ jmp cstar_dispatch sysretl_audit: @@ -371,16 +371,16 @@ sysretl_audit: cstar_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP) jz cstar_auditsys #endif xchgl %r9d,%ebp SAVE_EXTRA_REGS - CLEAR_RREGS 0, r9 + CLEAR_RREGS r9 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ + LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ RESTORE_EXTRA_REGS xchgl %ebp,%r9d cmpq $(IA32_NR_syscalls-1),%rax @@ -438,8 +438,8 @@ ENTRY(ia32_syscall) this could be a problem. */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS_EXCEPT_R891011 - orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP) jnz ia32_tracesys cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys @@ -447,9 +447,9 @@ ia32_do_call: IA32_ARG_FIXUP call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: - movq %rax,RAX-ARGOFFSET(%rsp) + movq %rax,RAX(%rsp) ia32_ret_from_sys_call: - CLEAR_RREGS -ARGOFFSET + CLEAR_RREGS jmp int_ret_from_sys_call ia32_tracesys: @@ -458,7 +458,7 @@ ia32_tracesys: movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ + LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ RESTORE_EXTRA_REGS cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ @@ -466,7 +466,7 @@ ia32_tracesys: END(ia32_syscall) ia32_badsys: - movq $0,ORIG_RAX-ARGOFFSET(%rsp) + movq $0,ORIG_RAX(%rsp) movq $-ENOSYS,%rax jmp ia32_sysret @@ -499,17 +499,17 @@ ia32_ptregs_common: CFI_ENDPROC CFI_STARTPROC32 simple CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8-ARGOFFSET - CFI_REL_OFFSET rax,RAX-ARGOFFSET - CFI_REL_OFFSET rcx,RCX-ARGOFFSET - CFI_REL_OFFSET rdx,RDX-ARGOFFSET - CFI_REL_OFFSET rsi,RSI-ARGOFFSET - CFI_REL_OFFSET rdi,RDI-ARGOFFSET - CFI_REL_OFFSET rip,RIP-ARGOFFSET -/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/ -/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ - CFI_REL_OFFSET rsp,RSP-ARGOFFSET -/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/ + CFI_DEF_CFA rsp,SS+8 + CFI_REL_OFFSET rax,RAX + CFI_REL_OFFSET rcx,RCX + CFI_REL_OFFSET rdx,RDX + CFI_REL_OFFSET rsi,RSI + CFI_REL_OFFSET rdi,RDI + CFI_REL_OFFSET rip,RIP +/* CFI_REL_OFFSET cs,CS*/ +/* CFI_REL_OFFSET rflags,EFLAGS*/ + CFI_REL_OFFSET rsp,RSP +/* CFI_REL_OFFSET ss,SS*/ SAVE_EXTRA_REGS 8 call *%rax 
RESTORE_EXTRA_REGS 8 diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 38356476b131..4a7ceb9789a5 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -88,8 +88,6 @@ For 32-bit we have the following conventions - kernel is built with #define RSP 19*8 #define SS 20*8 -#define ARGOFFSET 0 - .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 subq $15*8+\addskip, %rsp CFI_ADJUST_CFA_OFFSET 15*8+\addskip diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8fafed9f462d..06055f9578a8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -73,9 +73,9 @@ ENDPROC(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ -.macro TRACE_IRQS_IRETQ offset=ARGOFFSET +.macro TRACE_IRQS_IRETQ #ifdef CONFIG_TRACE_IRQFLAGS - bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ + bt $9,EFLAGS(%rsp) /* interrupts off? */ jnc 1f TRACE_IRQS_ON 1: @@ -107,8 +107,8 @@ ENDPROC(native_usergs_sysret64) call debug_stack_reset .endm -.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET - bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ +.macro TRACE_IRQS_IRETQ_DEBUG + bt $9,EFLAGS(%rsp) /* interrupts off? */ jnc 1f TRACE_IRQS_ON_DEBUG 1: @@ -184,16 +184,16 @@ ENDPROC(native_usergs_sysret64) * frame that enables passing a complete pt_regs to a C function. */ .macro DEFAULT_FRAME start=1 offset=0 - XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET - CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET - CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET - CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET - CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET - CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET - CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET - CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET - CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET - CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET + XCPT_FRAME \start, ORIG_RAX+\offset + CFI_REL_OFFSET rdi, RDI+\offset + CFI_REL_OFFSET rsi, RSI+\offset + CFI_REL_OFFSET rdx, RDX+\offset + CFI_REL_OFFSET rcx, RCX+\offset + CFI_REL_OFFSET rax, RAX+\offset + CFI_REL_OFFSET r8, R8+\offset + CFI_REL_OFFSET r9, R9+\offset + CFI_REL_OFFSET r10, R10+\offset + CFI_REL_OFFSET r11, R11+\offset CFI_REL_OFFSET rbx, RBX+\offset CFI_REL_OFFSET rbp, RBP+\offset CFI_REL_OFFSET r12, R12+\offset @@ -237,13 +237,13 @@ ENTRY(ret_from_fork) RESTORE_EXTRA_REGS - testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? + testl $3,CS(%rsp) # from kernel_thread? jz 1f testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET jnz int_ret_from_sys_call - RESTORE_TOP_OF_STACK %rdi, -ARGOFFSET + RESTORE_TOP_OF_STACK %rdi jmp ret_from_sys_call # go to the SYSRET fastpath 1: @@ -310,11 +310,11 @@ GLOBAL(system_call_after_swapgs) ENABLE_INTERRUPTS(CLBR_NONE) ALLOC_PT_GPREGS_ON_STACK 8 SAVE_C_REGS_EXCEPT_RAX_RCX - movq $-ENOSYS,RAX-ARGOFFSET(%rsp) - movq_cfi rax,(ORIG_RAX-ARGOFFSET) - movq %rcx,RIP-ARGOFFSET(%rsp) - CFI_REL_OFFSET rip,RIP-ARGOFFSET - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + movq $-ENOSYS,RAX(%rsp) + movq_cfi rax,ORIG_RAX + movq %rcx,RIP(%rsp) + CFI_REL_OFFSET rip,RIP + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP) jnz tracesys system_call_fastpath: #if __SYSCALL_MASK == ~0 @@ -326,13 +326,13 @@ system_call_fastpath: ja ret_from_sys_call /* and return regs->ax */ movq %r10,%rcx call *sys_call_table(,%rax,8) # XXX: rip relative - movq %rax,RAX-ARGOFFSET(%rsp) + movq %rax,RAX(%rsp) /* * Syscall return path ending with SYSRET (fast path) * Has incomplete stack frame and undefined top of stack. 
*/ ret_from_sys_call: - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP) jnz int_ret_from_sys_call_fixup /* Go the the slow path */ LOCKDEP_SYS_EXIT @@ -344,7 +344,7 @@ ret_from_sys_call: */ TRACE_IRQS_ON RESTORE_C_REGS_EXCEPT_RCX - movq RIP-ARGOFFSET(%rsp),%rcx + movq RIP(%rsp),%rcx CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ movq PER_CPU_VAR(old_rsp), %rsp @@ -353,7 +353,7 @@ ret_from_sys_call: CFI_RESTORE_STATE int_ret_from_sys_call_fixup: - FIXUP_TOP_OF_STACK %r11, -ARGOFFSET + FIXUP_TOP_OF_STACK %r11 jmp int_ret_from_sys_call /* Do syscall tracing */ @@ -364,7 +364,7 @@ tracesys: test %rax, %rax jnz tracesys_phase2 /* if needed, run the slow path */ RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ - movq ORIG_RAX-ARGOFFSET(%rsp), %rax + movq ORIG_RAX(%rsp), %rax jmp system_call_fastpath /* and return to the fast path */ tracesys_phase2: @@ -391,7 +391,7 @@ tracesys_phase2: ja int_ret_from_sys_call /* RAX(%rsp) is already set */ movq %r10,%rcx /* fixup for C */ call *sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) + movq %rax,RAX(%rsp) /* Use IRET because user could have changed frame */ /* @@ -475,9 +475,9 @@ END(stub_\func) ENTRY(\label) CFI_STARTPROC DEFAULT_FRAME 0, 8 /* offset 8: return address */ - FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET + FIXUP_TOP_OF_STACK %r11, 8 call \func - RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET + RESTORE_TOP_OF_STACK %r11, 8 ret CFI_ENDPROC END(\label) @@ -677,7 +677,7 @@ common_interrupt: ASM_CLAC addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ - /* 0(%rsp): old_rsp-ARGOFFSET */ + /* 0(%rsp): old_rsp */ ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -687,13 +687,13 @@ ret_from_intr: popq %rsi CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ /* return code expects complete pt_regs - adjust rsp accordingly: */ - leaq ARGOFFSET-RBP(%rsi), %rsp + leaq -RBP(%rsi),%rsp CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET + CFI_ADJUST_CFA_OFFSET RBP exit_intr: GET_THREAD_INFO(%rcx) - testl $3,CS-ARGOFFSET(%rsp) + testl $3,CS(%rsp) je retint_kernel /* Interrupt came from user space */ @@ -721,8 +721,8 @@ retint_swapgs: /* return to user-space */ * Try to use SYSRET instead of IRET if we're returning to * a completely clean 64-bit userspace context. */ - movq (RCX-ARGOFFSET)(%rsp), %rcx - cmpq %rcx,(RIP-ARGOFFSET)(%rsp) /* RCX == RIP */ + movq RCX(%rsp),%rcx + cmpq %rcx,RIP(%rsp) /* RCX == RIP */ jne opportunistic_sysret_failed /* @@ -743,19 +743,19 @@ retint_swapgs: /* return to user-space */ shr $__VIRTUAL_MASK_SHIFT, %rcx jnz opportunistic_sysret_failed - cmpq $__USER_CS,(CS-ARGOFFSET)(%rsp) /* CS must match SYSRET */ + cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ jne opportunistic_sysret_failed - movq (R11-ARGOFFSET)(%rsp), %r11 - cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */ + movq R11(%rsp),%r11 + cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ jne opportunistic_sysret_failed - testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */ + testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */ jnz opportunistic_sysret_failed /* nothing to check for RSP */ - cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */ + cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ jne opportunistic_sysret_failed /* @@ -870,7 +870,7 @@ retint_signal: ENTRY(retint_kernel) cmpl $0,PER_CPU_VAR(__preempt_count) jnz retint_restore_args - bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? 
*/ + bt $9,EFLAGS(%rsp) /* interrupts off? */ jnc retint_restore_args call preempt_schedule_irq jmp exit_intr @@ -1286,11 +1286,11 @@ ENTRY(paranoid_exit) TRACE_IRQS_OFF_DEBUG testl %ebx,%ebx /* swapgs needed? */ jnz paranoid_exit_no_swapgs - TRACE_IRQS_IRETQ 0 + TRACE_IRQS_IRETQ SWAPGS_UNSAFE_STACK jmp paranoid_exit_restore paranoid_exit_no_swapgs: - TRACE_IRQS_IRETQ_DEBUG 0 + TRACE_IRQS_IRETQ_DEBUG paranoid_exit_restore: RESTORE_EXTRA_REGS RESTORE_C_REGS From 050273d19b94f2adf9d35979cee949d6b6a9df84 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 26 Feb 2015 14:40:31 -0800 Subject: [PATCH 039/148] x86/asm/entry/64: Remove 'int_check_syscall_exit_work' Nothing references it anymore. Reported-by: Denys Vlasenko Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Fixes: 96b6352c1271 ("x86_64, entry: Remove the syscall exit audit and schedule optimizations") Link: http://lkml.kernel.org/r/dd2a4d26ecc7a5db61b476727175cd99ae2b32a4.1424989793.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 06055f9578a8..b0fbde1876e1 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -431,7 +431,6 @@ int_careful: int_very_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) -int_check_syscall_exit_work: SAVE_EXTRA_REGS /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx From b87cf63e2a5fbe3b368d5f5e5708e585b0fb3f84 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 26 Feb 2015 14:40:32 -0800 Subject: [PATCH 040/148] x86/asm/entry: Add comments about various syscall instructions SYSCALL/SYSRET and SYSENTER/SYSEXIT have weird semantics. Moreover, they differ in 32- and 64-bit mode. What is saved? What is not? Is rsp set? Are interrupts disabled? People tend to not remember these details well enough. This patch adds comments which explain in detail what registers are modified by each of these instructions. The comments are placed immediately before corresponding entry and exit points. Signed-off-by: Denys Vlasenko Signed-off-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/a94b98b63527797c871a81402ff5060b18fa880a.1424989793.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 133 +++++++++++++++++++++++-------------- arch/x86/kernel/entry_64.S | 32 +++++---- 2 files changed, 102 insertions(+), 63 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index e99f8a5be2df..b5670564a1fb 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -99,22 +99,25 @@ ENDPROC(native_irq_enable_sysexit) /* * 32bit SYSENTER instruction entry. * + * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. + * IF and VM in rflags are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old rip (!!!) and rflags. + * * Arguments: - * %eax System call number. - * %ebx Arg1 - * %ecx Arg2 - * %edx Arg3 - * %esi Arg4 - * %edi Arg5 - * %ebp user stack - * 0(%ebp) Arg6 - * - * Interrupts off. 
- * + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + * * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. Set up a complete hardware stack frame to share code + * path below. We set up a complete hardware stack frame to share code * with the int 0x80 path. - */ + */ ENTRY(ia32_sysenter_target) CFI_STARTPROC32 simple CFI_SIGNAL_FRAME @@ -128,6 +131,7 @@ ENTRY(ia32_sysenter_target) * disabled irqs, here we enable it straight after entry: */ ENABLE_INTERRUPTS(CLBR_NONE) + /* Construct iret frame (ss,rsp,rflags,cs,rip) */ movl %ebp,%ebp /* zero extension */ pushq_cfi $__USER32_DS /*CFI_REL_OFFSET ss,0*/ @@ -140,14 +144,19 @@ ENTRY(ia32_sysenter_target) pushq_cfi $__USER32_CS /*CFI_REL_OFFSET cs,0*/ movl %eax, %eax + /* Store thread_info->sysenter_return in rip stack slot */ pushq_cfi %r10 CFI_REL_OFFSET rip,0 + /* Store orig_ax */ pushq_cfi %rax + /* Construct the rest of "struct pt_regs" */ cld ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS_EXCEPT_R891011 - /* no need to do an access_ok check here because rbp has been - 32bit zero extended */ + /* + * no need to do an access_ok check here because rbp has been + * 32bit zero extended + */ ASM_STAC 1: movl (%rbp),%ebp _ASM_EXTABLE(1b,ia32_badarg) @@ -184,6 +193,7 @@ sysexit_from_sys_call: movl RIP(%rsp),%edx /* User %eip */ CFI_REGISTER rip,rdx RESTORE_RSI_RDI + /* pop everything except ss,rsp,rflags slots */ REMOVE_PT_GPREGS_FROM_STACK 3*8 xorq %r8,%r8 xorq %r9,%r9 @@ -194,6 +204,10 @@ sysexit_from_sys_call: popq_cfi %rcx /* User %esp */ CFI_REGISTER rsp,rcx TRACE_IRQS_ON + /* + * 32bit SYSEXIT restores eip from edx, esp from ecx. + * cs and ss are loaded from MSRs. + */ ENABLE_INTERRUPTS_SYSEXIT32 CFI_RESTORE_STATE @@ -274,23 +288,33 @@ ENDPROC(ia32_sysenter_target) /* * 32bit SYSCALL instruction entry. * + * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. + * + * Note: rflags saving+masking-with-MSR happens only in Long mode + * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it). + * Don't get confused: rflags saving+masking depends on Long Mode Active bit + * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes + * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). + * * Arguments: - * %eax System call number. - * %ebx Arg1 - * %ecx return EIP - * %edx Arg3 - * %esi Arg4 - * %edi Arg5 - * %ebp Arg2 [note: not saved in the stack frame, should not be touched] - * %esp user stack - * 0(%esp) Arg6 - * - * Interrupts off. - * + * eax system call number + * ecx return address + * ebx arg1 + * ebp arg2 (note: not saved in the stack frame, should not be touched) + * edx arg3 + * esi arg4 + * edi arg5 + * esp user stack + * 0(%esp) arg6 + * * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. Set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. 
+ */ ENTRY(ia32_cstar_target) CFI_STARTPROC32 simple CFI_SIGNAL_FRAME @@ -306,7 +330,7 @@ ENTRY(ia32_cstar_target) * disabled irqs and here we enable it straight after entry: */ ENABLE_INTERRUPTS(CLBR_NONE) - ALLOC_PT_GPREGS_ON_STACK 8 + ALLOC_PT_GPREGS_ON_STACK 8 /* +8: space for orig_ax */ SAVE_C_REGS_EXCEPT_RCX_R891011 movl %eax,%eax /* zero extension */ movq %rax,ORIG_RAX(%rsp) @@ -320,9 +344,11 @@ ENTRY(ia32_cstar_target) /*CFI_REL_OFFSET rflags,EFLAGS*/ movq %r8,RSP(%rsp) CFI_REL_OFFSET rsp,RSP - /* no need to do an access_ok check here because r8 has been - 32bit zero extended */ - /* hardware stack frame is complete now */ + /* iret stack frame is complete now */ + /* + * no need to do an access_ok check here because r8 has been + * 32bit zero extended + */ ASM_STAC 1: movl (%r8),%r9d _ASM_EXTABLE(1b,ia32_badarg) @@ -355,8 +381,15 @@ sysretl_from_sys_call: TRACE_IRQS_ON movl RSP(%rsp),%esp CFI_RESTORE rsp + /* + * 64bit->32bit SYSRET restores eip from ecx, + * eflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. + * (Note: 32bit->32bit SYSRET is different: since r11 + * does not exist, it merely sets eflags.IF=1). + */ USERGS_SYSRET32 - + #ifdef CONFIG_AUDITSYSCALL cstar_auditsys: CFI_RESTORE_STATE @@ -394,26 +427,26 @@ ia32_badarg: jmp ia32_sysret CFI_ENDPROC -/* - * Emulated IA32 system calls via int 0x80. +/* + * Emulated IA32 system calls via int 0x80. * - * Arguments: - * %eax System call number. - * %ebx Arg1 - * %ecx Arg2 - * %edx Arg3 - * %esi Arg4 - * %edi Arg5 - * %ebp Arg6 [note: not saved in the stack frame, should not be touched] + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 (note: not saved in the stack frame, should not be touched) * * Notes: - * Uses the same stack frame as the x86-64 version. - * All registers except %eax must be saved (but ptrace may violate that) + * Uses the same stack frame as the x86-64 version. + * All registers except eax must be saved (but ptrace may violate that). * Arguments are zero extended. For system calls that want sign extension and * take long arguments a wrapper is needed. Most calls can just be called * directly. - * Assumes it is only called from user space and entered with interrupts off. - */ + * Assumes it is only called from user space and entered with interrupts off. + */ ENTRY(ia32_syscall) CFI_STARTPROC32 simple @@ -432,7 +465,7 @@ ENTRY(ia32_syscall) */ ENABLE_INTERRUPTS(CLBR_NONE) movl %eax,%eax - pushq_cfi %rax + pushq_cfi %rax /* store orig_ax */ cld /* note the registers are not zero extended to the sf. this could be a problem. */ diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b0fbde1876e1..e5cbfbbf9479 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -256,25 +256,25 @@ ENTRY(ret_from_fork) END(ret_from_fork) /* - * System call entry. Up to 6 arguments in registers are supported. + * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. * - * SYSCALL does not save anything on the stack and does not change the - * stack pointer. However, it does mask the flags register for us, so - * CLD and CLAC are not needed. - */ - -/* - * Register setup: + * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). 
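(Illustration only, not part of the patch or of the comment being added above: a rough, user-space-only sketch of what "previously programmed MSRs" refers to for 64-bit SYSCALL. It just prints the architectural MSR numbers and the STAR selector packing; the selector values are made-up examples, and the real programming is done with wrmsr during kernel CPU setup, which is not shown here.)

	#include <stdio.h>
	#include <stdint.h>

	/* Architectural MSRs consulted by SYSCALL/SYSRET: */
	#define MSR_STAR         0xc0000081u  /* CS/SS selector bases           */
	#define MSR_LSTAR        0xc0000082u  /* 64-bit mode SYSCALL target rip */
	#define MSR_CSTAR        0xc0000083u  /* compat mode SYSCALL target rip */
	#define MSR_SYSCALL_MASK 0xc0000084u  /* rflags bits cleared on entry   */

	int main(void)
	{
		uint64_t kernel_cs = 0x10, user32_cs = 0x23;   /* example selectors */
		uint64_t star = (user32_cs << 48) | (kernel_cs << 32);

		printf("STAR  (0x%08x) = 0x%016llx (SYSRET base sel | SYSCALL base sel)\n",
		       MSR_STAR, (unsigned long long)star);
		printf("LSTAR (0x%08x) holds the rip SYSCALL loads in 64-bit mode\n",
		       MSR_LSTAR);
		printf("CSTAR (0x%08x) holds the rip for compat-mode SYSCALL\n",
		       MSR_CSTAR);
		printf("FMASK (0x%08x) holds the rflags bits SYSCALL clears (IF, TF, ...)\n",
		       MSR_SYSCALL_MASK);
		return 0;
	}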
SYSCALL does not save anything on the stack + * and does not change rsp. + * + * Registers on entry: * rax system call number + * rcx return address + * r11 saved rflags (note: r11 is callee-clobbered register in C ABI) * rdi arg0 - * rcx return address for syscall/sysret, C arg3 * rsi arg1 * rdx arg2 - * r10 arg3 (--> moved to rcx for C) + * r10 arg3 (needs to be moved to rcx to conform to C ABI) * r8 arg4 * r9 arg5 - * r11 eflags for syscall/sysret, temporary for C - * r12-r15,rbp,rbx saved by C code, not touched. + * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) * * Interrupts are off on entry. * Only called from user space. @@ -302,13 +302,14 @@ ENTRY(system_call) GLOBAL(system_call_after_swapgs) movq %rsp,PER_CPU_VAR(old_rsp) + /* kernel_stack is set so that 5 slots (iret frame) are preallocated */ movq PER_CPU_VAR(kernel_stack),%rsp /* * No need to follow this irqs off/on section - it's straight * and short: */ ENABLE_INTERRUPTS(CLBR_NONE) - ALLOC_PT_GPREGS_ON_STACK 8 + ALLOC_PT_GPREGS_ON_STACK 8 /* +8: space for orig_ax */ SAVE_C_REGS_EXCEPT_RAX_RCX movq $-ENOSYS,RAX(%rsp) movq_cfi rax,ORIG_RAX @@ -348,6 +349,11 @@ ret_from_sys_call: CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ movq PER_CPU_VAR(old_rsp), %rsp + /* + * 64bit SYSRET restores rip from rcx, + * rflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. + */ USERGS_SYSRET64 CFI_RESTORE_STATE From 1eeb207f870f746a863e5c59321d837d2d91c218 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 26 Feb 2015 14:40:33 -0800 Subject: [PATCH 041/148] x86/asm/entry/64: Move 'save_paranoid' and 'ret_from_fork' closer to their users For some odd reason, these two functions are at the very top of the file. "save_paranoid"'s caller is approximately in the middle of it, move it there. Move 'ret_from_fork' to be right after fork/exec helpers. This is a pure block move, nothing is changed in the function bodies. Signed-off-by: Denys Vlasenko Signed-off-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/6446bbfe4094532623a5b83779b7015fec167a9d.1424989793.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 106 ++++++++++++++++++------------------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e5cbfbbf9479..9e33d492ace3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -202,59 +202,6 @@ ENDPROC(native_usergs_sysret64) CFI_REL_OFFSET r15, R15+\offset .endm -ENTRY(save_paranoid) - XCPT_FRAME 1 RDI+8 - cld - SAVE_C_REGS 8 - SAVE_EXTRA_REGS 8 - movl $1,%ebx - movl $MSR_GS_BASE,%ecx - rdmsr - testl %edx,%edx - js 1f /* negative -> in kernel */ - SWAPGS - xorl %ebx,%ebx -1: ret - CFI_ENDPROC -END(save_paranoid) - -/* - * A newly forked process directly context switches into this address. - * - * rdi: prev task we switched from - */ -ENTRY(ret_from_fork) - DEFAULT_FRAME - - LOCK ; btr $TIF_FORK,TI_flags(%r8) - - pushq_cfi $0x0002 - popfq_cfi # reset kernel eflags - - call schedule_tail # rdi: 'prev' task parameter - - GET_THREAD_INFO(%rcx) - - RESTORE_EXTRA_REGS - - testl $3,CS(%rsp) # from kernel_thread? 
- jz 1f - - testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET - jnz int_ret_from_sys_call - - RESTORE_TOP_OF_STACK %rdi - jmp ret_from_sys_call # go to the SYSRET fastpath - -1: - movq %rbp, %rdi - call *%rbx - movl $0, RAX(%rsp) - RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call - CFI_ENDPROC -END(ret_from_fork) - /* * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. * @@ -581,6 +528,43 @@ END(stub_x32_execveat) #endif +/* + * A newly forked process directly context switches into this address. + * + * rdi: prev task we switched from + */ +ENTRY(ret_from_fork) + DEFAULT_FRAME + + LOCK ; btr $TIF_FORK,TI_flags(%r8) + + pushq_cfi $0x0002 + popfq_cfi # reset kernel eflags + + call schedule_tail # rdi: 'prev' task parameter + + GET_THREAD_INFO(%rcx) + + RESTORE_EXTRA_REGS + + testl $3,CS(%rsp) # from kernel_thread? + jz 1f + + testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET + jnz int_ret_from_sys_call + + RESTORE_TOP_OF_STACK %rdi + jmp ret_from_sys_call # go to the SYSRET fastpath + +1: + movq %rbp, %rdi + call *%rbx + movl $0, RAX(%rsp) + RESTORE_EXTRA_REGS + jmp int_ret_from_sys_call + CFI_ENDPROC +END(ret_from_fork) + /* * Build the entry stubs and pointer table with some assembler magic. * We pack 7 stubs into a single 32-byte chunk, which will fit in a @@ -1273,6 +1257,22 @@ idtentry async_page_fault do_async_page_fault has_error_code=1 idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) #endif +ENTRY(save_paranoid) + XCPT_FRAME 1 RDI+8 + cld + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx,%ebx +1: ret + CFI_ENDPROC +END(save_paranoid) + /* * "Paranoid" exit path from exception stack. This is invoked * only on return from non-NMI IST interrupts that came From ebfc453e27c676e104378366a0b027e5c6918631 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 26 Feb 2015 14:40:34 -0800 Subject: [PATCH 042/148] x86/asm/entry/64: Clean up and document various entry code details This patch does a lot of cleanup in comments and formatting, but it does not change any code: - Rename 'save_paranoid' to 'paranoid_entry': this makes naming similar to its "non-paranoid" sibling, 'error_entry', and to its counterpart, 'paranoid_exit'. - Use the same CFI annotation atop 'paranoid_entry' and 'error_entry'. - Fix irregular indentation of assembler operands. - Add/fix comments on top of 'paranoid_entry' and 'error_entry'. - Remove stale comment about "oldrax". - Make comments about "no swapgs" flag in ebx more prominent. - Deindent wrongly indented top-level comment atop 'paranoid_exit'. - Indent wrongly deindented comment inside 'error_entry'. Signed-off-by: Denys Vlasenko Signed-off-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Will Drewry Link: http://lkml.kernel.org/r/4640f9fcd5ea46eb299b1cd6d3f5da3167d2f78d.1424989793.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 68 ++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 32 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 9e33d492ace3..466947770648 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -982,10 +982,11 @@ ENTRY(\sym) testl $3, CS(%rsp) /* If coming from userspace, switch */ jnz 1f /* stacks. 
*/ .endif - call save_paranoid + call paranoid_entry .else call error_entry .endif + /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ DEFAULT_FRAME 0 @@ -1016,10 +1017,11 @@ ENTRY(\sym) addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) .endif + /* these procedures expect "no swapgs" flag in ebx */ .if \paranoid - jmp paranoid_exit /* %ebx: no swapgs flag */ + jmp paranoid_exit .else - jmp error_exit /* %ebx: no swapgs flag */ + jmp error_exit .endif .if \paranoid == 1 @@ -1257,8 +1259,13 @@ idtentry async_page_fault do_async_page_fault has_error_code=1 idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) #endif -ENTRY(save_paranoid) - XCPT_FRAME 1 RDI+8 +/* + * Save all registers in pt_regs, and switch gs if needed. + * Use slow, but surefire "are we in kernel?" check. + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + */ +ENTRY(paranoid_entry) + XCPT_FRAME 1 15*8 cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1271,20 +1278,19 @@ ENTRY(save_paranoid) xorl %ebx,%ebx 1: ret CFI_ENDPROC -END(save_paranoid) +END(paranoid_entry) - /* - * "Paranoid" exit path from exception stack. This is invoked - * only on return from non-NMI IST interrupts that came - * from kernel space. - * - * We may be returning to very strange contexts (e.g. very early - * in syscall entry), so checking for preemption here would - * be complicated. Fortunately, we there's no good reason - * to try to handle preemption here. - */ - - /* ebx: no swapgs flag */ +/* + * "Paranoid" exit path from exception stack. This is invoked + * only on return from non-NMI IST interrupts that came + * from kernel space. + * + * We may be returning to very strange contexts (e.g. very early + * in syscall entry), so checking for preemption here would + * be complicated. Fortunately, we there's no good reason + * to try to handle preemption here. + */ +/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) @@ -1305,13 +1311,11 @@ paranoid_exit_restore: END(paranoid_exit) /* - * Exception entry point. This expects an error code/orig_rax on the stack. - * returns in "no swapgs flag" in %ebx. + * Save all registers in pt_regs, and switch gs if needed. + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(error_entry) - XCPT_FRAME - CFI_ADJUST_CFA_OFFSET 15*8 - /* oldrax contains error code */ + XCPT_FRAME 1 15*8 cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 @@ -1324,12 +1328,12 @@ error_sti: TRACE_IRQS_OFF ret -/* - * There are two places in the kernel that can potentially fault with - * usergs. Handle them here. B stepping K8s sometimes report a - * truncated RIP for IRET exceptions returning to compat mode. Check - * for these here too. - */ + /* + * There are two places in the kernel that can potentially fault with + * usergs. Handle them here. B stepping K8s sometimes report a + * truncated RIP for IRET exceptions returning to compat mode. Check + * for these here too. 
+ */ error_kernelspace: CFI_REL_OFFSET rcx, RCX+8 incl %ebx @@ -1359,7 +1363,7 @@ error_bad_iret: END(error_entry) -/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ +/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(error_exit) DEFAULT_FRAME movl %ebx,%eax @@ -1585,13 +1589,13 @@ end_repeat_nmi: ALLOC_PT_GPREGS_ON_STACK /* - * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit + * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit * as we should not be calling schedule in NMI context. * Even with normal interrupts enabled. An NMI should not be * setting NEED_RESCHED or anything that normal interrupts and * exceptions might do. */ - call save_paranoid + call paranoid_entry DEFAULT_FRAME 0 /* From 14f6e9532dda399a7b789f744dc045f8865a9e42 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 26 Feb 2015 14:40:35 -0800 Subject: [PATCH 043/148] x86/asm/entry/64/compat: Fold the IA32_ARG_FIXUP macro into its callers Use of a small macro - one with conditional expansion - does more harm than good. It obfuscates code, with minimal code reuse. For example, because of obfuscation it's not obvious that in 'ia32_sysenter_target', we can optimize loading of r9 - currently it is loaded with a detour through ebp. This patch folds the IA32_ARG_FIXUP macro into its callers. No code changes. Signed-off-by: Denys Vlasenko Signed-off-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Will Drewry Link: http://lkml.kernel.org/r/4da092094cd78734384ac31e0d4ec1d8f69145a2.1424989793.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index b5670564a1fb..6dcd37256979 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -30,17 +30,6 @@ .section .entry.text, "ax" - .macro IA32_ARG_FIXUP noebp=0 - movl %edi,%r8d - .if \noebp - .else - movl %ebp,%r9d - .endif - xchg %ecx,%esi - movl %ebx,%edi - movl %edx,%edx /* zero extension */ - .endm - /* clobbers %rax */ .macro CLEAR_RREGS _r9=rax xorl %eax,%eax @@ -178,7 +167,12 @@ sysenter_flags_fixed: cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys sysenter_do_call: - IA32_ARG_FIXUP + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + movl %ebp,%r9d /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ sysenter_dispatch: call *ia32_sys_call_table(,%rax,8) movq %rax,RAX(%rsp) @@ -360,7 +354,12 @@ ENTRY(ia32_cstar_target) cmpq $IA32_NR_syscalls-1,%rax ja ia32_badsys cstar_do_call: - IA32_ARG_FIXUP 1 + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + /* r9 already loaded */ /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ cstar_dispatch: call *ia32_sys_call_table(,%rax,8) movq %rax,RAX(%rsp) @@ -477,7 +476,12 @@ ENTRY(ia32_syscall) cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys ia32_do_call: - IA32_ARG_FIXUP + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + movl %ebp,%r9d /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ call *ia32_sys_call_table(,%rax,8) # xxx: rip 
relative ia32_sysret: movq %rax,RAX(%rsp) From 911d2bb5ccaab102abbab2bb58438c75bc342ca9 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 26 Feb 2015 14:40:36 -0800 Subject: [PATCH 044/148] x86/asm/entry/64: Use more readable constants Constants such as SS+8 or SS+8-RIP are mysterious. In most cases, SS+8 is just meant to be SIZEOF_PTREGS, SS+8-RIP is RIP's offset in the iret frame. This patch changes some of these constants to be less mysterious. No code changes (verified with objdump). Signed-off-by: Denys Vlasenko Signed-off-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1d20491384773bd606e23a382fac23ddb49b5178.1424989793.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/calling.h | 2 ++ arch/x86/kernel/entry_64.S | 28 ++++++++++++++++------------ 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 4a7ceb9789a5..337423590b08 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -88,6 +88,8 @@ For 32-bit we have the following conventions - kernel is built with #define RSP 19*8 #define SS 20*8 +#define SIZEOF_PTREGS 21*8 + .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 subq $15*8+\addskip, %rsp CFI_ADJUST_CFA_OFFSET 15*8+\addskip diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 466947770648..858e94e86f5e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -164,12 +164,12 @@ ENDPROC(native_usergs_sysret64) * initial frame state for interrupts (and exceptions without error code) */ .macro INTR_FRAME start=1 offset=0 - EMPTY_FRAME \start, SS+8+\offset-RIP - /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ - CFI_REL_OFFSET rsp, RSP+\offset-RIP - /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ - /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ - CFI_REL_OFFSET rip, RIP+\offset-RIP + EMPTY_FRAME \start, 5*8+\offset + /*CFI_REL_OFFSET ss, 4*8+\offset*/ + CFI_REL_OFFSET rsp, 3*8+\offset + /*CFI_REL_OFFSET rflags, 2*8+\offset*/ + /*CFI_REL_OFFSET cs, 1*8+\offset*/ + CFI_REL_OFFSET rip, 0*8+\offset .endm /* @@ -177,7 +177,7 @@ ENDPROC(native_usergs_sysret64) * with vector already pushed) */ .macro XCPT_FRAME start=1 offset=0 - INTR_FRAME \start, RIP+\offset-ORIG_RAX + INTR_FRAME \start, 1*8+\offset .endm /* @@ -645,10 +645,14 @@ END(interrupt) cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp CFI_DEF_CFA_REGISTER rsi pushq %rsi + /* + * For debugger: + * "CFA (Current Frame Address) is the value on stack + offset" + */ CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ - 0x77 /* DW_OP_breg7 */, 0, \ + 0x77 /* DW_OP_breg7 (rsp) */, 0, \ 0x06 /* DW_OP_deref */, \ - 0x08 /* DW_OP_const1u */, SS+8-RBP, \ + 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \ 0x22 /* DW_OP_plus */ /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF @@ -674,7 +678,7 @@ ret_from_intr: /* Restore saved previous stack */ popq %rsi - CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ + CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */ /* return code expects complete pt_regs - adjust rsp accordingly: */ leaq -RBP(%rsi),%rsp CFI_DEF_CFA_REGISTER rsp @@ -1549,7 +1553,7 @@ first_nmi: .rept 5 pushq_cfi 11*8(%rsp) .endr - CFI_DEF_CFA_OFFSET SS+8-RIP + CFI_DEF_CFA_OFFSET 5*8 /* Everything up to here is safe from nested NMIs */ @@ -1577,7 +1581,7 @@ 
repeat_nmi: pushq_cfi -6*8(%rsp) .endr subq $(5*8), %rsp - CFI_DEF_CFA_OFFSET SS+8-RIP + CFI_DEF_CFA_OFFSET 5*8 end_repeat_nmi: /* From b3ab90b333e94659e7c351843ab41ec0004f73e8 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 26 Feb 2015 14:40:37 -0800 Subject: [PATCH 045/148] x86/asm/entry/64/compat: Use more readable constant The last instance of "mysterious" SS+8 constant is replaced by SIZEOF_PTREGS. Message-Id: <1424822419-10267-1-git-send-email-dvlasenk@redhat.com> Signed-off-by: Denys Vlasenko Signed-off-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/d35aeba3059407ac54f472ddcfbea767ff8916ac.1424989793.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 6dcd37256979..ed9746340363 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -536,7 +536,7 @@ ia32_ptregs_common: CFI_ENDPROC CFI_STARTPROC32 simple CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8 + CFI_DEF_CFA rsp,SIZEOF_PTREGS CFI_REL_OFFSET rax,RAX CFI_REL_OFFSET rcx,RCX CFI_REL_OFFSET rdx,RDX From d441c1f2b73ec742c2e55be804ebc6fee130c77f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 26 Feb 2015 14:40:38 -0800 Subject: [PATCH 046/148] x86/asm/entry/64: Simplify optimistic SYSRET Avoid redundant load of %r11 (it is already loaded a few instructions before). Also simplify %rsp restoration, instead of two steps: add $0x80, %rsp mov 0x18(%rsp), %rsp we can do a simplified single step to restore user-space RSP: mov 0x98(%rsp), %rsp and get the same result. Signed-off-by: Denys Vlasenko Signed-off-by: Andy Lutomirski [ Clarified the changelog. ] Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1aef69b346a6db0d99cdfb0f5ba83e8c985e27d7.1424989793.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/calling.h | 3 +++ arch/x86/kernel/entry_64.S | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 337423590b08..f1a962ff7ddf 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -176,6 +176,9 @@ For 32-bit we have the following conventions - kernel is built with .macro RESTORE_C_REGS_EXCEPT_RCX RESTORE_C_REGS_HELPER 1,0,1,1,1 .endm + .macro RESTORE_C_REGS_EXCEPT_R11 + RESTORE_C_REGS_HELPER 1,1,0,1,1 + .endm .macro RESTORE_RSI_RDI RESTORE_C_REGS_HELPER 0,0,0,0,0 .endm diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 858e94e86f5e..bc1527889c40 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -757,9 +757,9 @@ retint_swapgs: /* return to user-space */ */ irq_return_via_sysret: CFI_REMEMBER_STATE - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 - movq (RSP-RIP)(%rsp),%rsp + /* r11 is already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_R11 + movq RSP(%rsp),%rsp USERGS_SYSRET64 CFI_RESTORE_STATE From 1e3fbb8a1d814f35e2e689cf87714d38d9f3564d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 26 Feb 2015 14:40:39 -0800 Subject: [PATCH 047/148] x86/asm/entry/64: Remove a bogus 'ret_from_fork' optimization 'ret_from_fork' checks TIF_IA32 to determine whether 'pt_regs' and the related state make sense for 'ret_from_sys_call'. This is entirely the wrong check. TS_COMPAT would make a little more sense, but there's really no point in keeping this optimization at all. This fixes a return to the wrong user CS if we came from int 0x80 in a 64-bit task. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/4710be56d76ef994ddf59087aad98c000fbab9a4.1424989793.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index bc1527889c40..622ce4254893 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -550,11 +550,14 @@ ENTRY(ret_from_fork) testl $3,CS(%rsp) # from kernel_thread? jz 1f - testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET - jnz int_ret_from_sys_call - - RESTORE_TOP_OF_STACK %rdi - jmp ret_from_sys_call # go to the SYSRET fastpath + /* + * By the time we get here, we have no idea whether our pt_regs, + * ti flags, and ti status came from the 64-bit SYSCALL fast path, + * the slow path, or one of the ia32entry paths. + * Use int_ret_from_sys_call to return, since it can safely handle + * all of the above. + */ + jmp int_ret_from_sys_call 1: movq %rbp, %rdi From 5eca7453d61003bf886992388f8cb407e6f0d051 Mon Sep 17 00:00:00 2001 From: Wang Nan Date: Fri, 27 Feb 2015 12:19:49 +0800 Subject: [PATCH 048/148] x86/traps: Separate set_intr_gate() and clean up early_trap_init() As early_trap_init() doesn't use IST, replace set_intr_gate_ist() and set_system_intr_gate_ist() with their standard counterparts. set_intr_gate() requires a trace_debug symbol which we don't have and won't use. 
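
For reference, the tracing variant currently expands along these lines (a simplified
sketch: the BUG_ON() range check and exact line wrapping are omitted, see
arch/x86/include/asm/desc.h for the real macro). The second statement is what drags
in the trace_##addr symbol, i.e. trace_debug for the debug vector:

	#define set_intr_gate(n, addr)					\
		do {							\
			/* install the regular IDT entry */		\
			_set_gate(n, GATE_INTERRUPT, (void *)addr,	\
				  0, 0, __KERNEL_CS);			\
			/* install the trace IDT entry: needs trace_##addr */ \
			_trace_set_gate(n, GATE_INTERRUPT,		\
					(void *)trace_##addr,		\
					0, 0, __KERNEL_CS);		\
		} while (0)
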
This patch separates set_intr_gate() into two parts, and uses base version in early_trap_init(). Reported-by: Andy Lutomirski Signed-off-by: Wang Nan Acked-by: Andy Lutomirski Cc: Cc: Cc: Cc: Cc: Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1425010789-13714-1-git-send-email-wangnan0@huawei.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/desc.h | 7 ++++++- arch/x86/kernel/traps.c | 20 ++++++++++++-------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index a94b82e8f156..a0bf89fd2647 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -376,11 +376,16 @@ static inline void _set_gate(int gate, unsigned type, void *addr, * Pentium F0 0F bugfix can have resulted in the mapped * IDT being write-protected. */ -#define set_intr_gate(n, addr) \ +#define set_intr_gate_notrace(n, addr) \ do { \ BUG_ON((unsigned)n > 0xFF); \ _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \ __KERNEL_CS); \ + } while (0) + +#define set_intr_gate(n, addr) \ + do { \ + set_intr_gate_notrace(n, addr); \ _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\ 0, 0, __KERNEL_CS); \ } while (0) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 42819886be0c..9965bd1916db 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -926,16 +926,20 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) void __init early_trap_init(void) { /* - * Don't set ist to DEBUG_STACK as it doesn't work until TSS is - * ready in cpu_init() <-- trap_init(). Before trap_init(), CPU - * runs at ring 0 so it is impossible to hit an invalid stack. - * Using the original stack works well enough at this early - * stage. DEBUG_STACK will be equipped after cpu_init() in + * Don't use IST to set DEBUG_STACK as it doesn't work until TSS + * is ready in cpu_init() <-- trap_init(). Before trap_init(), + * CPU runs at ring 0 so it is impossible to hit an invalid + * stack. Using the original stack works well enough at this + * early stage. DEBUG_STACK will be equipped after cpu_init() in * trap_init(). + * + * We don't need to set trace_idt_table like set_intr_gate(), + * since we don't have trace_debug and it will be reset to + * 'debug' in trap_init() by set_intr_gate_ist(). */ - set_intr_gate_ist(X86_TRAP_DB, &debug, 0); + set_intr_gate_notrace(X86_TRAP_DB, debug); /* int3 can be called from all */ - set_system_intr_gate_ist(X86_TRAP_BP, &int3, 0); + set_system_intr_gate(X86_TRAP_BP, &int3); #ifdef CONFIG_X86_32 set_intr_gate(X86_TRAP_PF, page_fault); #endif @@ -1015,7 +1019,7 @@ void __init trap_init(void) /* * X86_TRAP_DB and X86_TRAP_BP have been set - * in early_trap_init(). However, DEBUG_STACK works only after + * in early_trap_init(). However, ITS works only after * cpu_init() loads TSS. See comments in early_trap_init(). */ set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); From 8ef46a672a7d852709561d10672b6eaa8a4acd82 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 5 Mar 2015 19:19:02 -0800 Subject: [PATCH 049/148] x86/asm/entry: Add this_cpu_sp0() to read sp0 for the current cpu We currently store references to the top of the kernel stack in multiple places: kernel_stack (with an offset) and init_tss.x86_tss.sp0 (no offset). The latter is defined by hardware and is a clean canonical way to find the top of the stack. Add an accessor so we can start using it. This needs minor paravirt tweaks. 
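
To make the "accessor plus paravirt tweak" concrete, an abbreviated sketch of the
pieces added by the hunks below (the existing hypercall plumbing inside the Xen and
lguest hooks is elided):

	/* read the top of the current CPU's kernel stack, as tracked by sp0 */
	static inline unsigned long this_cpu_sp0(void)
	{
		return this_cpu_read_stable(init_tss.x86_tss.sp0);
	}

	/* paravirt tweak; same idea in lguest_load_sp0() and xen_load_sp0() */
	static void xen_load_sp0(struct tss_struct *tss,
				 struct thread_struct *thread)
	{
		/* ... existing code hands thread->sp0 to the hypervisor ... */
		/* new: also keep the kernel-visible copy in sync */
		tss->x86_tss.sp0 = thread->sp0;
	}
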
On native, sp0 defines the top of the kernel stack and is therefore always correct. On Xen and lguest, the hypervisor tracks the top of the stack, but we want to start reading sp0 in the kernel. Fixing this is simple: just update our local copy of sp0 as well as the hypervisor's copy on task switches. Signed-off-by: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Rusty Russell Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/8d675581859712bee09a055ed8f785d80dac1eca.1425611534.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 5 +++++ arch/x86/kernel/process.c | 1 + arch/x86/lguest/boot.c | 1 + arch/x86/xen/enlighten.c | 1 + 4 files changed, 8 insertions(+) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 7be2c9a6caba..71c3a826a690 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -564,6 +564,11 @@ static inline void native_swapgs(void) #endif } +static inline unsigned long this_cpu_sp0(void) +{ + return this_cpu_read_stable(init_tss.x86_tss.sp0); +} + #ifdef CONFIG_PARAVIRT #include #else diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 046e2d620bbe..ff5c9088b1c5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -38,6 +38,7 @@ * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; +EXPORT_PER_CPU_SYMBOL_GPL(init_tss); #ifdef CONFIG_X86_64 static DEFINE_PER_CPU(unsigned char, is_idle); diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index ac4453d8520e..8561585ee2c6 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1076,6 +1076,7 @@ static void lguest_load_sp0(struct tss_struct *tss, { lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, THREAD_SIZE / PAGE_SIZE); + tss->x86_tss.sp0 = thread->sp0; } /* Let's just say, I wouldn't do debugging under a Guest. */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5240f563076d..81665c9f2132 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -912,6 +912,7 @@ static void xen_load_sp0(struct tss_struct *tss, mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); xen_mc_issue(PARAVIRT_LAZY_CPU); + tss->x86_tss.sp0 = thread->sp0; } static void xen_set_iopl_mask(unsigned mask) From 75182b1632a89f12540baa1806a7c5c180db620c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 5 Mar 2015 19:19:03 -0800 Subject: [PATCH 050/148] x86/asm/entry: Switch all C consumers of kernel_stack to this_cpu_sp0() This will make modifying the semantics of kernel_stack easier. The change to ist_begin_non_atomic() is necessary because sp0 no longer points to the same THREAD_SIZE-aligned region as RSP; it's one byte too high for that. At Denys' suggestion, rather than offsetting it, just check explicitly that we're in the correct range ending at sp0. This has the added benefit that we no longer assume that the thread stack is aligned to THREAD_SIZE. Suggested-by: Denys Vlasenko Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ef8254ad414cbb8034c9a56396eeb24f5dd5b0de.1425611534.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/thread_info.h | 3 +-- arch/x86/kernel/traps.c | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 1d4e4f279a32..a2fa1899494e 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -159,8 +159,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack); static inline struct thread_info *current_thread_info(void) { struct thread_info *ti; - ti = (void *)(this_cpu_read_stable(kernel_stack) + - KERNEL_STACK_OFFSET - THREAD_SIZE); + ti = (void *)(this_cpu_sp0() - THREAD_SIZE); return ti; } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9965bd1916db..fa290586ed37 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -174,8 +174,8 @@ void ist_begin_non_atomic(struct pt_regs *regs) * will catch asm bugs and any attempt to use ist_preempt_enable * from double_fault. */ - BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack)) - & ~(THREAD_SIZE - 1)) != 0); + BUG_ON((unsigned long)(this_cpu_sp0() - current_stack_pointer()) >= + THREAD_SIZE); preempt_count_sub(HARDIRQ_OFFSET); } From 9d0c914c60f4d3123debb653340dc1f7cf44939d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 5 Mar 2015 19:19:04 -0800 Subject: [PATCH 051/148] x86/asm/entry/64/compat: Change the 32-bit sysenter code to use sp0 The ia32 sysenter code loaded the top of the kernel stack into rsp by loading kernel_stack and then adjusting it. It can be simplified to just read sp0 directly. This requires the addition of a new asm-offsets entry for sp0. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/88ff9006163d296a0665338585c36d9bfb85235d.1425611534.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 3 +-- arch/x86/kernel/asm-offsets_64.c | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index ed9746340363..719db63b35c4 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -113,8 +113,7 @@ ENTRY(ia32_sysenter_target) CFI_DEF_CFA rsp,0 CFI_REGISTER rsp,rbp SWAPGS_UNSAFE_STACK - movq PER_CPU_VAR(kernel_stack), %rsp - addq $(KERNEL_STACK_OFFSET),%rsp + movq PER_CPU_VAR(init_tss + TSS_sp0), %rsp /* * No need to follow this irqs on/off section: the syscall * disabled irqs, here we enable it straight after entry: diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index fdcbb4d27c9f..5ce6f2da8763 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -81,6 +81,7 @@ int main(void) #undef ENTRY OFFSET(TSS_ist, tss_struct, x86_tss.ist); + OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); BLANK(); DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); From 24933b82c0d9a711475a5ef7904eb733f561e637 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 5 Mar 2015 19:19:05 -0800 Subject: [PATCH 052/148] x86/asm/entry: Rename 'init_tss' to 'cpu_tss' It has nothing to do with init -- there's only one TSS per cpu. Other names considered include: - current_tss: Confusing because we never switch the tss. - singleton_tss: Too long. 
This patch was generated with 's/init_tss/cpu_tss/g'. Followup patches will fix INIT_TSS and INIT_TSS_IST by hand. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/da29fb2a793e4f649d93ce2d1ed320ebe8516262.1425611534.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 2 +- arch/x86/include/asm/processor.h | 4 ++-- arch/x86/kernel/cpu/common.c | 6 +++--- arch/x86/kernel/entry_64.S | 2 +- arch/x86/kernel/ioport.c | 2 +- arch/x86/kernel/process.c | 6 +++--- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/vm86_32.c | 4 ++-- arch/x86/power/cpu.c | 2 +- 10 files changed, 16 insertions(+), 16 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 719db63b35c4..ad9efef65a6b 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -113,7 +113,7 @@ ENTRY(ia32_sysenter_target) CFI_DEF_CFA rsp,0 CFI_REGISTER rsp,rbp SWAPGS_UNSAFE_STACK - movq PER_CPU_VAR(init_tss + TSS_sp0), %rsp + movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp /* * No need to follow this irqs on/off section: the syscall * disabled irqs, here we enable it straight after entry: diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 71c3a826a690..117ee65473e2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -282,7 +282,7 @@ struct tss_struct { } ____cacheline_aligned; -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); +DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); /* * Save the original ist values for checking stack pointers during debugging @@ -566,7 +566,7 @@ static inline void native_swapgs(void) static inline unsigned long this_cpu_sp0(void) { - return this_cpu_read_stable(init_tss.x86_tss.sp0); + return this_cpu_read_stable(cpu_tss.x86_tss.sp0); } #ifdef CONFIG_PARAVIRT diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2346c95c6ab1..5d0f0cc7ea26 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -979,7 +979,7 @@ static void syscall32_cpu_init(void) void enable_sep_cpu(void) { int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss, cpu); if (!boot_cpu_has(X86_FEATURE_SEP)) { put_cpu(); @@ -1307,7 +1307,7 @@ void cpu_init(void) */ load_ucode_ap(); - t = &per_cpu(init_tss, cpu); + t = &per_cpu(cpu_tss, cpu); oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA @@ -1391,7 +1391,7 @@ void cpu_init(void) { int cpu = smp_processor_id(); struct task_struct *curr = current; - struct tss_struct *t = &per_cpu(init_tss, cpu); + struct tss_struct *t = &per_cpu(cpu_tss, cpu); struct thread_struct *thread = &curr->thread; wait_for_master_cpu(cpu); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 622ce4254893..0c00fd80249a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -959,7 +959,7 @@ apicinterrupt IRQ_WORK_VECTOR \ /* * Exception entry points. 
*/ -#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) +#define INIT_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 4ddaf66ea35f..37dae792dbbe 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -54,7 +54,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * because the ->io_bitmap_max value must match the bitmap * contents: */ - tss = &per_cpu(init_tss, get_cpu()); + tss = &per_cpu(cpu_tss, get_cpu()); if (turn_on) bitmap_clear(t->io_bitmap_ptr, from, num); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ff5c9088b1c5..6f6087349231 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -37,8 +37,8 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; -EXPORT_PER_CPU_SYMBOL_GPL(init_tss); +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = INIT_TSS; +EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss); #ifdef CONFIG_X86_64 static DEFINE_PER_CPU(unsigned char, is_idle); @@ -110,7 +110,7 @@ void exit_thread(void) unsigned long *bp = t->io_bitmap_ptr; if (bp) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); t->io_bitmap_ptr = NULL; clear_thread_flag(TIF_IO_BITMAP); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 603c4f99cb5a..d3460af3d27a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -248,7 +248,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss, cpu); fpu_switch_t fpu; /* never put a printk in __switch_to... 
printk() calls wake_up*() indirectly */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 854b5981b327..2cd562f96c1f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -277,7 +277,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss, cpu); unsigned fsindex, gsindex; fpu_switch_t fpu; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e8edcf52e069..fc9db6ef2a95 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -150,7 +150,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) do_exit(SIGSEGV); } - tss = &per_cpu(init_tss, get_cpu()); + tss = &per_cpu(cpu_tss, get_cpu()); current->thread.sp0 = current->thread.saved_sp0; current->thread.sysenter_cs = __KERNEL_CS; load_sp0(tss, ¤t->thread); @@ -318,7 +318,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk tsk->thread.saved_fs = info->regs32->fs; tsk->thread.saved_gs = get_user_gs(info->regs32); - tss = &per_cpu(init_tss, get_cpu()); + tss = &per_cpu(cpu_tss, get_cpu()); tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; if (cpu_has_sep) tsk->thread.sysenter_cs = 0; diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 3e32ed5648a0..757678fb26e1 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -134,7 +134,7 @@ static void do_fpu_end(void) static void fix_processor_context(void) { int cpu = smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); + struct tss_struct *t = &per_cpu(cpu_tss, cpu); #ifdef CONFIG_X86_64 struct desc_struct *desc = get_cpu_gdt_table(cpu); tss_desc tss; From d0a0de21f82bbc1737ea3c831f018d0c2bc6b9c2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 5 Mar 2015 19:19:06 -0800 Subject: [PATCH 053/148] x86/asm/entry: Remove INIT_TSS and fold the definitions into 'cpu_tss' The INIT_TSS is unnecessary. Just define the initial TSS where 'cpu_tss' is defined. While we're at it, merge the 32-bit and 64-bit definitions. The only syntactic change is that 32-bit kernels were computing sp0 as long, but now they compute it as unsigned long. Verified by objdump: the contents and relocations of .data..percpu..shared_aligned are unchanged on 32-bit and 64-bit kernels. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/8fc39fa3f6c5d635e93afbdd1a0fe0678a6d7913.1425611534.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 20 -------------------- arch/x86/kernel/process.c | 20 +++++++++++++++++++- 2 files changed, 19 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 117ee65473e2..f5e3ec63767d 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -818,22 +818,6 @@ static inline void spin_lock_prefetch(const void *x) .io_bitmap_ptr = NULL, \ } -/* - * Note that the .io_bitmap member must be extra-big. This is because - * the CPU will access an additional byte beyond the end of the IO - * permission bitmap. The extra byte must be all 1 bits, and must - * be within the limit. 
- */ -#define INIT_TSS { \ - .x86_tss = { \ - .sp0 = sizeof(init_stack) + (long)&init_stack, \ - .ss0 = __KERNEL_DS, \ - .ss1 = __KERNEL_CS, \ - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ - }, \ - .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \ -} - extern unsigned long thread_saved_pc(struct task_struct *tsk); #define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) @@ -892,10 +876,6 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ } -#define INIT_TSS { \ - .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ -} - /* * Return saved PC of a blocked thread. * What is this good for? it will be always the scheduler or ret_from_fork. diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 6f6087349231..f4c0af7fc3a0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -37,7 +37,25 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = INIT_TSS; +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { + .x86_tss = { + .sp0 = (unsigned long)&init_stack + sizeof(init_stack), +#ifdef CONFIG_X86_32 + .ss0 = __KERNEL_DS, + .ss1 = __KERNEL_CS, + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, +#endif + }, +#ifdef CONFIG_X86_32 + /* + * Note that the .io_bitmap member must be extra-big. This is because + * the CPU will access an additional byte beyond the end of the IO + * permission bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, +#endif +}; EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss); #ifdef CONFIG_X86_64 From 9b47668843d800ed57f6f6bfd6f5c4cffdf201c6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 5 Mar 2015 19:19:07 -0800 Subject: [PATCH 054/148] x86/asm/entry: Rename 'INIT_TSS_IST' to 'CPU_TSS_IST' This has nothing to do with the init thread or the initial anything. It's just the CPU's TSS. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a0bd5e26b32a2e1f08ff99017d0997118fbb2485.1425611534.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 0c00fd80249a..5117a2baefe9 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -959,7 +959,7 @@ apicinterrupt IRQ_WORK_VECTOR \ /* * Exception entry points. 
*/ -#define INIT_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) @@ -1015,13 +1015,13 @@ ENTRY(\sym) .endif .if \shift_ist != -1 - subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif call \do_sym .if \shift_ist != -1 - addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif /* these procedures expect "no swapgs" flag in ebx */ From b27559a433bb6080d95c2593d4a2b81401197911 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 6 Mar 2015 17:50:18 -0800 Subject: [PATCH 055/148] x86/asm/entry: Delay loading sp0 slightly on task switch The change: 75182b1632a8 ("x86/asm/entry: Switch all C consumers of kernel_stack to this_cpu_sp0()") had the unintended side effect of changing the return value of current_thread_info() during part of the context switch process. Change it back. This has no effect as far as I can tell -- it's just for consistency. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/9fcaa47dd8487db59eed7a3911b6ae409476763e.1425692936.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 10 +++++----- arch/x86/kernel/process_64.c | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index d3460af3d27a..0405cab6634d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -255,11 +255,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) fpu = switch_fpu_prepare(prev_p, next_p, cpu); - /* - * Reload esp0. - */ - load_sp0(tss, next); - /* * Save away %gs. No need to save %fs, as it was saved on the * stack on entry. No need to save %es and %ds, as those are @@ -310,6 +305,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_end_context_switch(next_p); + /* + * Reload esp0. This changes current_thread_info(). + */ + load_sp0(tss, next); + this_cpu_write(kernel_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE - KERNEL_STACK_OFFSET); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 2cd562f96c1f..1e393d27d701 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -283,9 +283,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) fpu = switch_fpu_prepare(prev_p, next_p, cpu); - /* Reload esp0 and ss1. */ - load_sp0(tss, next); - /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). * @@ -413,6 +410,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); + /* Reload esp0 and ss1. This changes current_thread_info(). */ + load_sp0(tss, next); + this_cpu_write(kernel_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE - KERNEL_STACK_OFFSET); From a7fcf28d431ef70afaa91496e64e16dc51dccec4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 6 Mar 2015 17:50:19 -0800 Subject: [PATCH 056/148] x86/asm/entry: Replace this_cpu_sp0() with current_top_of_stack() and fix it on x86_32 I broke 32-bit kernels. 
The implementation of sp0 was correct as far as I can tell, but sp0 was much weirder on x86_32 than I realized. It has the following issues: - Init's sp0 is inconsistent with everything else's: non-init tasks are offset by 8 bytes. (I have no idea why, and the comment is unhelpful.) - vm86 does crazy things to sp0. Fix it up by replacing this_cpu_sp0() with current_top_of_stack() and using a new percpu variable to track the top of the stack on x86_32. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Fixes: 75182b1632a8 ("x86/asm/entry: Switch all C consumers of kernel_stack to this_cpu_sp0()") Link: http://lkml.kernel.org/r/d09dbe270883433776e0cbee3c7079433349e96d.1425692936.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 11 ++++++++++- arch/x86/include/asm/thread_info.h | 4 +--- arch/x86/kernel/cpu/common.c | 13 +++++++++++-- arch/x86/kernel/process_32.c | 11 +++++++---- arch/x86/kernel/smpboot.c | 2 ++ arch/x86/kernel/traps.c | 4 ++-- 6 files changed, 33 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f5e3ec63767d..48a61c1c626e 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -284,6 +284,10 @@ struct tss_struct { DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); +#ifdef CONFIG_X86_32 +DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); +#endif + /* * Save the original ist values for checking stack pointers during debugging */ @@ -564,9 +568,14 @@ static inline void native_swapgs(void) #endif } -static inline unsigned long this_cpu_sp0(void) +static inline unsigned long current_top_of_stack(void) { +#ifdef CONFIG_X86_64 return this_cpu_read_stable(cpu_tss.x86_tss.sp0); +#else + /* sp0 on x86_32 is special in and around vm86 mode. */ + return this_cpu_read_stable(cpu_current_top_of_stack); +#endif } #ifdef CONFIG_PARAVIRT diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a2fa1899494e..7740edd56fed 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -158,9 +158,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack); static inline struct thread_info *current_thread_info(void) { - struct thread_info *ti; - ti = (void *)(this_cpu_sp0() - THREAD_SIZE); - return ti; + return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); } static inline unsigned long current_stack_pointer(void) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5d0f0cc7ea26..76348334b934 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1130,8 +1130,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE) __visible; /* - * The following four percpu variables are hot. Align current_task to - * cacheline size such that all four fall in the same cacheline. + * The following percpu variables are hot. Align current_task to + * cacheline size such that they fall in the same cacheline. */ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; @@ -1226,6 +1226,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +/* + * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find + * the top of the kernel stack. 
Use an extra percpu variable to track the + * top of the kernel stack directly. + */ +DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = + (unsigned long)&init_thread_union + THREAD_SIZE; +EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); + #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 0405cab6634d..1b9963faf4eb 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -306,13 +306,16 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) arch_end_context_switch(next_p); /* - * Reload esp0. This changes current_thread_info(). + * Reload esp0, kernel_stack, and current_top_of_stack. This changes + * current_thread_info(). */ load_sp0(tss, next); - this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + - THREAD_SIZE - KERNEL_STACK_OFFSET); + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE - KERNEL_STACK_OFFSET); + this_cpu_write(cpu_current_top_of_stack, + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE); /* * Restore %gs if needed (which is common) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index febc6aabc72e..759388c538cf 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -806,6 +806,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); + per_cpu(cpu_current_top_of_stack, cpu) = + (unsigned long)task_stack_page(idle) + THREAD_SIZE; #else clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index fa290586ed37..081252c44cde 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -174,8 +174,8 @@ void ist_begin_non_atomic(struct pt_regs *regs) * will catch asm bugs and any attempt to use ist_preempt_enable * from double_fault. */ - BUG_ON((unsigned long)(this_cpu_sp0() - current_stack_pointer()) >= - THREAD_SIZE); + BUG_ON((unsigned long)(current_top_of_stack() - + current_stack_pointer()) >= THREAD_SIZE); preempt_count_sub(HARDIRQ_OFFSET); } From 3e1aa7cb59aff4b245b45e326fcdba1bf7f105c6 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 6 Mar 2015 21:55:32 +0100 Subject: [PATCH 057/148] x86/asm: Optimize unnecessarily wide TEST instructions By the nature of the TEST operation, it is often possible to test a narrower part of the operand: "testl $3, mem" -> "testb $3, mem", "testq $3, %rcx" -> "testb $3, %cl" This results in shorter instructions, because the TEST instruction has no sign-entending byte-immediate forms unlike other ALU ops. Note that this change does not create any LCP (Length-Changing Prefix) stalls, which happen when adding a 0x66 prefix, which happens when 16-bit immediates are used, which changes such TEST instructions: [test_opcode] [modrm] [imm32] to: [0x66] [test_opcode] [modrm] [imm16] where [imm16] has a *different length* now: 2 bytes instead of 4. This confuses the decoder and slows down execution. REX prefixes were carefully designed to almost never hit this case: adding REX prefix does not change instruction length except MOVABS and MOV [addr],RAX instruction. 
This patch does not add instructions which would use a 0x66 prefix, code changes in assembly are: -48 f7 07 01 00 00 00 testq $0x1,(%rdi) +f6 07 01 testb $0x1,(%rdi) -48 f7 c1 01 00 00 00 test $0x1,%rcx +f6 c1 01 test $0x1,%cl -48 f7 c1 02 00 00 00 test $0x2,%rcx +f6 c1 02 test $0x2,%cl -41 f7 c2 01 00 00 00 test $0x1,%r10d +41 f6 c2 01 test $0x1,%r10b -48 f7 c1 04 00 00 00 test $0x4,%rcx +f6 c1 04 test $0x4,%cl -48 f7 c1 08 00 00 00 test $0x8,%rcx +f6 c1 08 test $0x8,%cl Linus further notes: "There are no stalls from using 8-bit instruction forms. Now, changing from 64-bit or 32-bit 'test' instructions to 8-bit ones *could* cause problems if it ends up having forwarding issues, so that instead of just forwarding the result, you end up having to wait for it to be stable in the L1 cache (or possibly the register file). The forwarding from the store buffer is simplest and most reliable if the read is done at the exact same address and the exact same size as the write that gets forwarded. But that's true only if: (a) the write was very recent and is still in the write queue. I'm not sure that's the case here anyway. (b) on at least most Intel microarchitectures, you have to test a different byte than the lowest one (so forwarding a 64-bit write to a 8-bit read ends up working fine, as long as the 8-bit read is of the low 8 bits of the written data). A very similar issue *might* show up for registers too, not just memory writes, if you use 'testb' with a high-byte register (where instead of forwarding the value from the original producer it needs to go through the register file and then shifted). But it's mainly a problem for store buffers. But afaik, the way Denys changed the test instructions, neither of the above issues should be true. The real problem for store buffer forwarding tends to be "write 8 bits, read 32 bits". That can be really surprisingly expensive, because the read ends up having to wait until the write has hit the cacheline, and we might talk tens of cycles of latency here. But "write 32 bits, read the low 8 bits" *should* be fast on pretty much all x86 chips, afaik." Signed-off-by: Denys Vlasenko Acked-by: Andy Lutomirski Acked-by: Linus Torvalds Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1425675332-31576-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_64.S | 2 +- arch/x86/kernel/relocate_kernel_32.S | 8 ++++---- arch/x86/kernel/relocate_kernel_64.S | 8 ++++---- arch/x86/lib/checksum_32.S | 4 ++-- arch/x86/lib/csum-copy_64.S | 2 +- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 9a0919678f97..ae6588b301c2 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -146,7 +146,7 @@ startup_64: leaq level2_kernel_pgt(%rip), %rdi leaq 4096(%rdi), %r8 /* See if it is a valid page table entry */ -1: testq $1, 0(%rdi) +1: testb $1, 0(%rdi) jz 2f addq %rbp, 0(%rdi) /* Go to the next page */ diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index e13f8e7c22a6..77630d57e7bf 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -226,23 +226,23 @@ swap_pages: movl (%ebx), %ecx addl $4, %ebx 1: - testl $0x1, %ecx /* is it a destination page */ + testb $0x1, %cl /* is it a destination page */ jz 2f movl %ecx, %edi andl $0xfffff000, %edi jmp 0b 2: - testl $0x2, %ecx /* is it an indirection page */ + testb $0x2, %cl /* is it an indirection page */ jz 2f movl %ecx, %ebx andl $0xfffff000, %ebx jmp 0b 2: - testl $0x4, %ecx /* is it the done indicator */ + testb $0x4, %cl /* is it the done indicator */ jz 2f jmp 3f 2: - testl $0x8, %ecx /* is it the source indicator */ + testb $0x8, %cl /* is it the source indicator */ jz 0b /* Ignore it otherwise */ movl %ecx, %esi /* For every source page do a copy */ andl $0xfffff000, %esi diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 3fd2c693e475..04cb1790a596 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -221,23 +221,23 @@ swap_pages: movq (%rbx), %rcx addq $8, %rbx 1: - testq $0x1, %rcx /* is it a destination page? */ + testb $0x1, %cl /* is it a destination page? */ jz 2f movq %rcx, %rdi andq $0xfffffffffffff000, %rdi jmp 0b 2: - testq $0x2, %rcx /* is it an indirection page? */ + testb $0x2, %cl /* is it an indirection page? */ jz 2f movq %rcx, %rbx andq $0xfffffffffffff000, %rbx jmp 0b 2: - testq $0x4, %rcx /* is it the done indicator? */ + testb $0x4, %cl /* is it the done indicator? */ jz 2f jmp 3f 2: - testq $0x8, %rcx /* is it the source indicator? */ + testb $0x8, %cl /* is it the source indicator? 
*/ jz 0b /* Ignore it otherwise */ movq %rcx, %rsi /* For ever source page do a copy */ andq $0xfffffffffffff000, %rsi diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index c3b9953d3fa0..9bc944a91274 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -125,7 +125,7 @@ ENTRY(csum_partial) 6: addl %ecx,%eax adcl $0, %eax 7: - testl $1, 12(%esp) + testb $1, 12(%esp) jz 8f roll $8, %eax 8: @@ -245,7 +245,7 @@ ENTRY(csum_partial) addl %ebx,%eax adcl $0,%eax 80: - testl $1, 12(%esp) + testb $1, 12(%esp) jz 90f roll $8, %eax 90: diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index 2419d5fefae3..9734182966f3 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -196,7 +196,7 @@ ENTRY(csum_partial_copy_generic) /* handle last odd byte */ .Lhandle_1: - testl $1, %r10d + testb $1, %r10b jz .Lende xorl %ebx, %ebx source From 29722cd4ef666705b2eda1c3ba44435488e509eb Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 9 Mar 2015 19:39:21 +0100 Subject: [PATCH 058/148] x86/asm/entry/64: Save R11 into pt_regs->flags on SYSCALL64 fastpath Before this patch, R11 was saved in pt_regs->r11. Which looks natural, but requires messy shuffling to/from iret frame whenever ptrace or e.g. sys_iopl() wants to modify flags - because that's how this register is used by SYSCALL/SYSRET. This patch saves R11 in pt_regs->flags, and uses that value for the SYSRET64 instruction. Shuffling is eliminated. FIXUP/RESTORE_TOP_OF_STACK are simplified. stub_iopl is no longer needed: pt_regs->flags needs no fixing up. Testing shows that syscall fast path is ~54.3 ns before and after the patch (on 2.7 GHz Sandy Bridge CPU). Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1425926364-9526-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/calling.h | 20 ++++++++++++++------ arch/x86/kernel/entry_64.S | 24 +++++++++++------------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index f1a962ff7ddf..4b5f7bf2b780 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -95,9 +95,11 @@ For 32-bit we have the following conventions - kernel is built with CFI_ADJUST_CFA_OFFSET 15*8+\addskip .endm - .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8plus=1 - .if \r8plus + .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 + .if \r11 movq_cfi r11, 6*8+\offset + .endif + .if \r8910 movq_cfi r10, 7*8+\offset movq_cfi r9, 8*8+\offset movq_cfi r8, 9*8+\offset @@ -113,16 +115,19 @@ For 32-bit we have the following conventions - kernel is built with movq_cfi rdi, 14*8+\offset .endm .macro SAVE_C_REGS offset=0 - SAVE_C_REGS_HELPER \offset, 1, 1, 1 + SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 .endm .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0 - SAVE_C_REGS_HELPER \offset, 0, 0, 1 + SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1 .endm .macro SAVE_C_REGS_EXCEPT_R891011 - SAVE_C_REGS_HELPER 0, 1, 1, 0 + SAVE_C_REGS_HELPER 0, 1, 1, 0, 0 .endm .macro SAVE_C_REGS_EXCEPT_RCX_R891011 - SAVE_C_REGS_HELPER 0, 1, 0, 0 + SAVE_C_REGS_HELPER 0, 1, 0, 0, 0 + .endm + .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11 + SAVE_C_REGS_HELPER 0, 0, 0, 1, 0 .endm .macro SAVE_EXTRA_REGS offset=0 @@ -179,6 +184,9 @@ For 32-bit we have the following conventions - kernel is built with .macro RESTORE_C_REGS_EXCEPT_R11 RESTORE_C_REGS_HELPER 1,1,0,1,1 .endm + .macro RESTORE_C_REGS_EXCEPT_RCX_R11 + RESTORE_C_REGS_HELPER 1,0,0,1,1 + .endm .macro RESTORE_RSI_RDI RESTORE_C_REGS_HELPER 0,0,0,0,0 .endm diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 5117a2baefe9..324200aca431 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -121,14 +121,12 @@ ENDPROC(native_usergs_sysret64) #endif /* - * C code is not supposed to know about undefined top of stack. Every time - * a C function with an pt_regs argument is called from the SYSCALL based - * fast path FIXUP_TOP_OF_STACK is needed. + * C code is not supposed to know that the iret frame is not populated. + * Every time a C function with an pt_regs argument is called from + * the SYSCALL based fast path FIXUP_TOP_OF_STACK is needed. * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs * manipulation. 
*/ - - /* %rsp:at FRAMEEND */ .macro FIXUP_TOP_OF_STACK tmp offset=0 movq PER_CPU_VAR(old_rsp),\tmp movq \tmp,RSP+\offset(%rsp) @@ -136,15 +134,13 @@ ENDPROC(native_usergs_sysret64) movq $__USER_CS,CS+\offset(%rsp) movq RIP+\offset(%rsp),\tmp /* get rip */ movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */ - movq R11+\offset(%rsp),\tmp /* get eflags */ - movq \tmp,EFLAGS+\offset(%rsp) + movq EFLAGS+\offset(%rsp),\tmp /* ditto for rflags->r11 */ + movq \tmp,R11+\offset(%rsp) .endm .macro RESTORE_TOP_OF_STACK tmp offset=0 movq RSP+\offset(%rsp),\tmp movq \tmp,PER_CPU_VAR(old_rsp) - movq EFLAGS+\offset(%rsp),\tmp - movq \tmp,R11+\offset(%rsp) .endm /* @@ -257,9 +253,10 @@ GLOBAL(system_call_after_swapgs) */ ENABLE_INTERRUPTS(CLBR_NONE) ALLOC_PT_GPREGS_ON_STACK 8 /* +8: space for orig_ax */ - SAVE_C_REGS_EXCEPT_RAX_RCX + SAVE_C_REGS_EXCEPT_RAX_RCX_R11 movq $-ENOSYS,RAX(%rsp) movq_cfi rax,ORIG_RAX + movq %r11,EFLAGS(%rsp) movq %rcx,RIP(%rsp) CFI_REL_OFFSET rip,RIP testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP) @@ -277,7 +274,7 @@ system_call_fastpath: movq %rax,RAX(%rsp) /* * Syscall return path ending with SYSRET (fast path) - * Has incomplete stack frame and undefined top of stack. + * Has incompletely filled pt_regs, iret frame is also incomplete. */ ret_from_sys_call: testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP) @@ -291,9 +288,10 @@ ret_from_sys_call: * sysretq will re-enable interrupts: */ TRACE_IRQS_ON - RESTORE_C_REGS_EXCEPT_RCX - movq RIP(%rsp),%rcx + RESTORE_C_REGS_EXCEPT_RCX_R11 + movq RIP(%rsp),%rcx CFI_REGISTER rip,rcx + movq EFLAGS(%rsp),%r11 /*CFI_REGISTER rflags,r11*/ movq PER_CPU_VAR(old_rsp), %rsp /* From 616ab249f1e42f6135642183529f910fcedc2642 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 10 Mar 2015 11:45:06 +0100 Subject: [PATCH 059/148] x86/asm/entry/64: Remove stub_iopl stub_iopl is no longer needed: pt_regs->flags needs no fixing up after previous change. Remove it. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1425984307-2143-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 13 ------------- arch/x86/syscalls/syscall_64.tbl | 2 +- arch/x86/um/sys_call_table_64.c | 2 +- 3 files changed, 2 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 324200aca431..703ced057199 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -421,22 +421,9 @@ ENTRY(stub_\func) END(stub_\func) .endm - .macro FIXED_FRAME label,func -ENTRY(\label) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 /* offset 8: return address */ - FIXUP_TOP_OF_STACK %r11, 8 - call \func - RESTORE_TOP_OF_STACK %r11, 8 - ret - CFI_ENDPROC -END(\label) - .endm - FORK_LIKE clone FORK_LIKE fork FORK_LIKE vfork - FIXED_FRAME stub_iopl, sys_iopl ENTRY(stub_execve) CFI_STARTPROC diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 8d656fbb57aa..9ef32d5f1b19 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -178,7 +178,7 @@ 169 common reboot sys_reboot 170 common sethostname sys_sethostname 171 common setdomainname sys_setdomainname -172 common iopl stub_iopl +172 common iopl sys_iopl 173 common ioperm sys_ioperm 174 64 create_module 175 common init_module sys_init_module diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 5cdfa9db2217..a75d8700472a 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -16,7 +16,7 @@ */ /* Not going to be implemented by UML, since we have no hardware. */ -#define stub_iopl sys_ni_syscall +#define sys_iopl sys_ni_syscall #define sys_ioperm sys_ni_syscall /* From 263042e4630a85e856b4a8cd72f28dab33ef4741 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 9 Mar 2015 19:39:23 +0100 Subject: [PATCH 060/148] x86/asm/entry/64: Save user RSP in pt_regs->sp on SYSCALL64 fastpath Prepare for the removal of 'usersp', by simplifying PER_CPU(old_rsp) usage: - use it only as temp storage - store the userspace stack pointer immediately in pt_regs->sp on syscall entry, instead of using it later, on syscall exit. - change C code to use pt_regs->sp only, instead of PER_CPU(old_rsp) and task->thread.usersp. FIXUP/RESTORE_TOP_OF_STACK are simplified as well. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1425926364-9526-4-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/compat.h | 2 +- arch/x86/include/asm/ptrace.h | 8 ++------ arch/x86/kernel/entry_64.S | 18 +++++++----------- arch/x86/kernel/perf_regs.c | 2 +- arch/x86/kernel/process_64.c | 3 +-- 5 files changed, 12 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 59c6c401f79f..acdee09228b3 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -301,7 +301,7 @@ static inline void __user *arch_compat_alloc_user_space(long len) sp = task_pt_regs(current)->sp; } else { /* -128 for the x32 ABI redzone */ - sp = this_cpu_read(old_rsp) - 128; + sp = task_pt_regs(current)->sp - 128; } return (void __user *)round_down(sp - len, 16); diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 4077d963a1a0..74bb2e0f3030 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -145,12 +145,8 @@ static inline bool user_64bit_mode(struct pt_regs *regs) #endif } -#define current_user_stack_pointer() this_cpu_read(old_rsp) -/* ia32 vs. x32 difference */ -#define compat_user_stack_pointer() \ - (test_thread_flag(TIF_IA32) \ - ? current_pt_regs()->sp \ - : this_cpu_read(old_rsp)) +#define current_user_stack_pointer() current_pt_regs()->sp +#define compat_user_stack_pointer() current_pt_regs()->sp #endif #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 703ced057199..d86788c3257b 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -128,8 +128,6 @@ ENDPROC(native_usergs_sysret64) * manipulation. */ .macro FIXUP_TOP_OF_STACK tmp offset=0 - movq PER_CPU_VAR(old_rsp),\tmp - movq \tmp,RSP+\offset(%rsp) movq $__USER_DS,SS+\offset(%rsp) movq $__USER_CS,CS+\offset(%rsp) movq RIP+\offset(%rsp),\tmp /* get rip */ @@ -139,8 +137,7 @@ ENDPROC(native_usergs_sysret64) .endm .macro RESTORE_TOP_OF_STACK tmp offset=0 - movq RSP+\offset(%rsp),\tmp - movq \tmp,PER_CPU_VAR(old_rsp) + /* nothing to do */ .endm /* @@ -222,9 +219,6 @@ ENDPROC(native_usergs_sysret64) * Interrupts are off on entry. * Only called from user space. * - * XXX if we had a free scratch register we could save the RSP into the stack frame - * and report it properly in ps. Unfortunately we haven't. - * * When user can change the frames always force IRET. That is because * it deals with uncanonical addresses better. SYSRET has trouble * with them due to bugs in both AMD and Intel CPUs. 
@@ -253,11 +247,13 @@ GLOBAL(system_call_after_swapgs) */ ENABLE_INTERRUPTS(CLBR_NONE) ALLOC_PT_GPREGS_ON_STACK 8 /* +8: space for orig_ax */ + movq %rcx,RIP(%rsp) + movq PER_CPU_VAR(old_rsp),%rcx + movq %r11,EFLAGS(%rsp) + movq %rcx,RSP(%rsp) + movq_cfi rax,ORIG_RAX SAVE_C_REGS_EXCEPT_RAX_RCX_R11 movq $-ENOSYS,RAX(%rsp) - movq_cfi rax,ORIG_RAX - movq %r11,EFLAGS(%rsp) - movq %rcx,RIP(%rsp) CFI_REL_OFFSET rip,RIP testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP) jnz tracesys @@ -293,7 +289,7 @@ ret_from_sys_call: CFI_REGISTER rip,rcx movq EFLAGS(%rsp),%r11 /*CFI_REGISTER rflags,r11*/ - movq PER_CPU_VAR(old_rsp), %rsp + movq RSP(%rsp),%rsp /* * 64bit SYSRET restores rip from rcx, * rflags from r11 (but RF and VM bits are forced to 0), diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index 781861cc5ee8..02a8720414c0 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -177,7 +177,7 @@ void perf_get_regs_user(struct perf_regs *regs_user, * than just blindly copying user_regs. */ regs_user->abi = PERF_SAMPLE_REGS_ABI_64; - regs_user_copy->sp = this_cpu_read(old_rsp); + regs_user_copy->sp = user_regs->sp; regs_user_copy->cs = __USER_CS; regs_user_copy->ss = __USER_DS; regs_user_copy->cx = -1; /* usually contains garbage */ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 1e393d27d701..e8c124a1f885 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -602,6 +602,5 @@ long sys_arch_prctl(int code, unsigned long addr) unsigned long KSTK_ESP(struct task_struct *task) { - return (test_tsk_thread_flag(task, TIF_IA32)) ? - (task_pt_regs(task)->sp) : ((task)->thread.usersp); + return task_pt_regs(task)->sp; } From c6f2062935c8fcb31235799eaee8bcd5b649936b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 12 Mar 2015 13:57:51 -0700 Subject: [PATCH 061/148] x86/signal/64: Fix SS handling for signals delivered to 64-bit programs The comment in the signal code says that apps can save/restore other segments on their own. It's true that apps can *save* SS on their own, but there's no way for apps to restore it: SYSCALL effectively resets SS to __USER_DS, so any value that user code tries to load into SS gets lost on entry to sigreturn. This recycles two padding bytes in the segment selector area for SS. While we're at it, we need a second change to make this useful. If the signal we're delivering is caused by a bad SS value, saving that value isn't enough. We need to remove that bad value from the regs before we try to deliver the signal. Oddly, the i386 code already got this right. I suspect that 64-bit programs that try to run 16-bit code and use signals will have a lot of trouble without this. Signed-off-by: Andy Lutomirski Reviewed-by: Oleg Nesterov Acked-by: Borislav Petkov Cc: Al Viro Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/405594361340a2ec32f8e2b115c142df0e180d8e.1426193719.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sigcontext.h | 2 +- arch/x86/include/uapi/asm/sigcontext.h | 2 +- arch/x86/kernel/signal.c | 22 +++++++++++++--------- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 9dfce4e0417d..f910cdcb71fd 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -59,7 +59,7 @@ struct sigcontext { unsigned short cs; unsigned short gs; unsigned short fs; - unsigned short __pad0; + unsigned short ss; unsigned long err; unsigned long trapno; unsigned long oldmask; diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index d8b9f9081e86..076b11fd6fa1 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -179,7 +179,7 @@ struct sigcontext { __u16 cs; __u16 gs; __u16 fs; - __u16 __pad0; + __u16 ss; __u64 err; __u64 trapno; __u64 oldmask; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index e5042463c1bc..e2f6061a9003 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -94,15 +94,8 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, COPY(r15); #endif /* CONFIG_X86_64 */ -#ifdef CONFIG_X86_32 COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); -#else /* !CONFIG_X86_32 */ - /* Kernel saves and restores only the CS segment register on signals, - * which is the bare minimum needed to allow mixed 32/64-bit code. - * App's signal handler can save/restore other segments if needed. */ - COPY_SEG_CPL3(cs); -#endif /* CONFIG_X86_32 */ get_user_ex(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); @@ -164,6 +157,7 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, put_user_ex(regs->cs, &sc->cs); put_user_ex(0, &sc->gs); put_user_ex(0, &sc->fs); + put_user_ex(regs->ss, &sc->ss); #endif /* CONFIG_X86_32 */ put_user_ex(fpstate, &sc->fpstate); @@ -457,9 +451,19 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, regs->sp = (unsigned long)frame; - /* Set up the CS register to run signal handlers in 64-bit mode, - even if the handler happens to be interrupting 32-bit code. */ + /* + * Set up the CS and SS registers to run signal handlers in + * 64-bit mode, even if the handler happens to be interrupting + * 32-bit or 16-bit code. + * + * SS is subtle. In 64-bit mode, we don't need any particular + * SS descriptor, but we do need SS to be valid. It's possible + * that the old SS is entirely bogus -- this can happen if the + * signal we're trying to deliver is #GP or #SS caused by a bad + * SS value. + */ regs->cs = __USER_CS; + regs->ss = __USER_DS; return 0; } From 9a036b93a344235b7899401d04e97c34f3a2554c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 12 Mar 2015 13:57:52 -0700 Subject: [PATCH 062/148] x86/signal/64: Remove 'fs' and 'gs' from sigcontext As far as I can tell, these fields have been set to zero on save and ignored on restore since Linux was imported into git. Rename them '__pad1' and '__pad2' to avoid confusion. This may also allow us to recycle them some day. This also adds a comment clarifying the history of those fields. I'm intentionally avoiding calling either of them '__pad0': the field formerly known as '__pad0' is now 'ss'. 
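For completeness, here is a hedged userspace sketch (not part of this patch) of where these four 16-bit slots are visible to applications: on x86-64, glibc packs them into the single REG_CSGSFS word of mcontext_t, in the same order as the uapi struct above -- cs, then the old gs and fs slots, then the slot that is now ss. The glibc mapping and the program below are assumptions for illustration only:

#define _GNU_SOURCE
#include <signal.h>
#include <stdio.h>
#include <ucontext.h>

static void handler(int sig, siginfo_t *si, void *ctx)
{
	ucontext_t *uc = ctx;
	unsigned long long csgsfs = uc->uc_mcontext.gregs[REG_CSGSFS];

	(void)sig; (void)si;
	/* printf() is not async-signal-safe, but fine for this synchronous demo */
	printf("cs=%#llx pad2(gs)=%#llx pad1(fs)=%#llx ss=%#llx\n",
	       csgsfs & 0xffff,			/* cs */
	       (csgsfs >> 16) & 0xffff,		/* __pad2, formerly gs, always 0 */
	       (csgsfs >> 32) & 0xffff,		/* __pad1, formerly fs, always 0 */
	       (csgsfs >> 48) & 0xffff);	/* ss once this series is applied */
}

int main(void)
{
	struct sigaction sa = { .sa_sigaction = handler, .sa_flags = SA_SIGINFO };

	sigemptyset(&sa.sa_mask);
	sigaction(SIGUSR1, &sa, NULL);
	raise(SIGUSR1);
	return 0;
}
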
Signed-off-by: Andy Lutomirski Reviewed-by: Oleg Nesterov Cc: Al Viro Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/844f8490e938780c03355be4c9b69eb4c494bf4e.1426193719.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/sigcontext.h | 4 ++-- arch/x86/include/uapi/asm/sigcontext.h | 19 +++++++++++++++++-- arch/x86/kernel/signal.c | 4 ++-- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index f910cdcb71fd..6fe6b182c998 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -57,8 +57,8 @@ struct sigcontext { unsigned long ip; unsigned long flags; unsigned short cs; - unsigned short gs; - unsigned short fs; + unsigned short __pad2; /* Was called gs, but was always zero. */ + unsigned short __pad1; /* Was called fs, but was always zero. */ unsigned short ss; unsigned long err; unsigned long trapno; diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index 076b11fd6fa1..16dc4e8a2cd3 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -177,8 +177,23 @@ struct sigcontext { __u64 rip; __u64 eflags; /* RFLAGS */ __u16 cs; - __u16 gs; - __u16 fs; + + /* + * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"), + * Linux saved and restored fs and gs in these slots. This + * was counterproductive, as fsbase and gsbase were never + * saved, so arch_prctl was presumably unreliable. + * + * If these slots are ever needed for any other purpose, there + * is some risk that very old 64-bit binaries could get + * confused. I doubt that many such binaries still work, + * though, since the same patch in 2.5.64 also removed the + * 64-bit set_thread_area syscall, so it appears that there is + * no TLS API that works in both pre- and post-2.5.64 kernels. + */ + __u16 __pad2; /* Was gs. */ + __u16 __pad1; /* Was fs. */ + __u16 ss; __u64 err; __u64 trapno; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index e2f6061a9003..edcb862cdcae 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -155,8 +155,8 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, #else /* !CONFIG_X86_32 */ put_user_ex(regs->flags, &sc->flags); put_user_ex(regs->cs, &sc->cs); - put_user_ex(0, &sc->gs); - put_user_ex(0, &sc->fs); + put_user_ex(0, &sc->__pad2); + put_user_ex(0, &sc->__pad1); put_user_ex(regs->ss, &sc->ss); #endif /* CONFIG_X86_32 */ From 3ee4298f440c81638cbb5ec06f2497fb7a9a9eb4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 10 Mar 2015 11:05:58 -0700 Subject: [PATCH 063/148] x86/asm/entry: Create and use a 'TOP_OF_KERNEL_STACK_PADDING' macro x86_32, unlike x86_64, pads the top of the kernel stack, because the hardware stack frame formats are variable in size. Document this padding and give it a name. This should make no change whatsoever to the compiled kernel image. It also doesn't fix any of the current bugs in this area. Signed-off-by: Andy Lutomirski Acked-by: Denys Vlasenko Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/02bf2f54b8dcb76a62a142b6dfe07d4ef7fc582e.1426009661.git.luto@amacapital.net [ Fixed small details, such as a missed magic constant in entry_32.S pointed out by Denys Vlasenko. 
] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 3 ++- arch/x86/include/asm/thread_info.h | 27 +++++++++++++++++++++++++++ arch/x86/kernel/entry_32.S | 2 +- 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 48a61c1c626e..88d9aa745898 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -849,7 +849,8 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define task_pt_regs(task) \ ({ \ struct pt_regs *__regs__; \ - __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ + __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task)) - \ + TOP_OF_KERNEL_STACK_PADDING); \ __regs__ - 1; \ }) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 7740edd56fed..ba115eb6fbcf 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -12,6 +12,33 @@ #include #include +/* + * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we + * reserve at the top of the kernel stack. We do it because of a nasty + * 32-bit corner case. On x86_32, the hardware stack frame is + * variable-length. Except for vm86 mode, struct pt_regs assumes a + * maximum-length frame. If we enter from CPL 0, the top 8 bytes of + * pt_regs don't actually exist. Ordinarily this doesn't matter, but it + * does in at least one case: + * + * If we take an NMI early enough in SYSENTER, then we can end up with + * pt_regs that extends above sp0. On the way out, in the espfix code, + * we can read the saved SS value, but that value will be above sp0. + * Without this offset, that can result in a page fault. (We are + * careful that, in this case, the value we read doesn't matter.) + * + * In vm86 mode, the hardware frame is much longer still, but we neither + * access the extra members from NMI context, nor do we write such a + * frame at sp0 at all. + * + * x86_64 has a fixed-length stack frame. + */ +#ifdef CONFIG_X86_32 +# define TOP_OF_KERNEL_STACK_PADDING 8 +#else +# define TOP_OF_KERNEL_STACK_PADDING 0 +#endif + /* * low level task data that entry.S needs immediate access to * - this struct should fit entirely inside of one cache line diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index e33ba51b1069..4c8cc34e6d68 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -398,7 +398,7 @@ sysenter_past_esp: * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. */ - pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) + pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+TOP_OF_KERNEL_STACK_PADDING+4*4)(%esp) CFI_REL_OFFSET eip, 0 pushl_cfi %eax From d9e05cc5a53246e074dc2b84956252e4bbe392cd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 10 Mar 2015 11:05:59 -0700 Subject: [PATCH 064/148] x86/asm/entry: Unify and fix initial thread_struct::sp0 values x86_32 and x86_64 need slightly different thread_struct::sp0 values, and x86_32's was incorrect for init. This never mattered -- the init thread never runs user code, so we never used thread_struct::sp0 for anything. Fix it and mostly unify them. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1b810c1d2e797e27bb4a7708c426101161edd1f6.1426009661.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 7 +++++-- arch/x86/kernel/process.c | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 88d9aa745898..fc6d8d0d8d53 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -811,6 +811,9 @@ static inline void spin_lock_prefetch(const void *x) prefetchw(x); } +#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ + TOP_OF_KERNEL_STACK_PADDING) + #ifdef CONFIG_X86_32 /* * User space process size: 3GB (default). @@ -821,7 +824,7 @@ static inline void spin_lock_prefetch(const void *x) #define STACK_TOP_MAX STACK_TOP #define INIT_THREAD { \ - .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .sp0 = TOP_OF_INIT_STACK, \ .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ @@ -883,7 +886,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ - .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ + .sp0 = TOP_OF_INIT_STACK \ } /* diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index f4c0af7fc3a0..12b1cf606ddf 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -39,7 +39,7 @@ */ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { .x86_tss = { - .sp0 = (unsigned long)&init_stack + sizeof(init_stack), + .sp0 = TOP_OF_INIT_STACK, #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, From 76e4c4908a4904a61aa67ae5eb0b2a7588c4a546 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 10 Mar 2015 11:06:00 -0700 Subject: [PATCH 065/148] x86/asm/entry/32: Document our abuse of x86_hw_tss::ss1 and x86_hw_tss::sp1 This has confused me for a while. Now that I figured it out, document it. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b7efc1b7364039824776f68e9ddee9ec1500e894.1426009661.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index fc6d8d0d8d53..b26208998b7c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -209,9 +209,24 @@ struct x86_hw_tss { unsigned short back_link, __blh; unsigned long sp0; unsigned short ss0, __ss0h; - unsigned long sp1; - /* ss1 caches MSR_IA32_SYSENTER_CS: */ - unsigned short ss1, __ss1h; + + /* + * We don't use ring 1, so sp1 and ss1 are convenient scratch + * spaces in the same cacheline as sp0. We use them to cache + * some MSR values to avoid unnecessary wrmsr instructions. + * + * We use SYSENTER_ESP to find sp0 and for the NMI emergency + * stack, but we need to context switch it because we do + * horrible things to the kernel stack in vm86 mode. + * + * We use SYSENTER_CS to disable sysenter in vm86 mode to avoid + * corrupting the stack if we went through the sysenter path + * from vm86 mode. 
+ */ + unsigned long sp1; /* MSR_IA32_SYSENTER_ESP */ + unsigned short ss1; /* MSR_IA32_SYSENTER_CS */ + + unsigned short __ss1h; unsigned long sp2; unsigned short ss2, __ss2h; unsigned long __cr3; From 5c39403e004bec75ce0c549541be5479595d6ad0 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 13 Mar 2015 15:09:03 +0100 Subject: [PATCH 066/148] x86/asm/entry: Simplify task_pt_regs() macro definition Before this change, task_pt_regs() was using KSTK_TOP(), and it was the only use of that macro. In turn, KSTK_TOP used THREAD_SIZE_LONGS, and it was the only use of that macro too. Fold these macros into task_pt_regs(). Tweak comment about "- 8" - we now use a symbolic constant, not literal 8. Signed-off-by: Denys Vlasenko Reviewed-by: Steven Rostedt Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1426255743-5394-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b26208998b7c..6a5c0ec5ee0e 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -847,15 +847,8 @@ static inline void spin_lock_prefetch(const void *x) extern unsigned long thread_saved_pc(struct task_struct *tsk); -#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) -#define KSTK_TOP(info) \ -({ \ - unsigned long *__ptr = (unsigned long *)(info); \ - (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ -}) - /* - * The below -8 is to reserve 8 bytes on top of the ring0 stack. + * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. * This is necessary to guarantee that the entire "struct pt_regs" * is accessible even if the CPU haven't stored the SS/ESP registers * on the stack (interrupt gate does not save these registers @@ -864,12 +857,11 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); * "struct pt_regs" is possible, but they may contain the * completely wrong values. */ -#define task_pt_regs(task) \ -({ \ - struct pt_regs *__regs__; \ - __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task)) - \ - TOP_OF_KERNEL_STACK_PADDING); \ - __regs__ - 1; \ +#define task_pt_regs(task) \ +({ \ + unsigned long __ptr = (unsigned long)task_stack_page(task); \ + __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ + ((struct pt_regs *)__ptr) - 1; \ }) #define KSTK_ESP(task) (task_pt_regs(task)->sp) From 3876488444e71238e287459c39d7692b6f718c3e Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 9 Mar 2015 15:52:17 +0100 Subject: [PATCH 067/148] include/stddef.h: Move offsetofend() from vfio.h to a generic kernel header Suggested by Andy. Suggested-by: Andy Lutomirski Signed-off-by: Denys Vlasenko Acked-by: Linus Torvalds Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1425912738-559-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- include/linux/stddef.h | 9 +++++++++ include/linux/vfio.h | 13 ------------- 2 files changed, 9 insertions(+), 13 deletions(-) diff --git a/include/linux/stddef.h b/include/linux/stddef.h index f4aec0e75c3a..076af437284d 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h @@ -19,3 +19,12 @@ enum { #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) #endif #endif + +/** + * offsetofend(TYPE, MEMBER) + * + * @TYPE: The type of the structure + * @MEMBER: The member within the structure to get the end offset of + */ +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 2d67b8998fd8..049b2f497bc7 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -78,19 +78,6 @@ extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); extern void vfio_unregister_iommu_driver( const struct vfio_iommu_driver_ops *ops); -/** - * offsetofend(TYPE, MEMBER) - * - * @TYPE: The type of the structure - * @MEMBER: The member within the structure to get the end offset of - * - * Simple helper macro for dealing with variable sized structures passed - * from user space. This allows us to easily determine if the provided - * structure is sized to include various fields. - */ -#define offsetofend(TYPE, MEMBER) \ - (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) - /* * External user API */ From d828c71fba8922b116b4ec56c3e5bca8c822d5ae Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 9 Mar 2015 15:52:18 +0100 Subject: [PATCH 068/148] x86/asm/entry/32: Document the 32-bit SYSENTER "emergency stack" better Before the patch, the 'tss_struct::stack' field was not referenced anywhere. It was used only to set SYSENTER's stack to point after the last byte of tss_struct, thus the trailing field, stack[64], was used. But grep would not know it. You can comment it out, compile, and kernel will even run until an unlucky NMI corrupts io_bitmap[] (which is also not easily detectable). This patch changes code so that the purpose and usage of this field is not mysterious anymore, and can be easily grepped for. This does change generated code, for a subtle reason: since tss_struct is ____cacheline_aligned, there happens to be 5 longs of padding at the end. Old code was using the padding too; new code will strictly use it only for SYSENTER_stack[]. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1425912738-559-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 4 ++-- arch/x86/kernel/asm-offsets_32.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6a5c0ec5ee0e..5abd9a535a24 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -291,9 +291,9 @@ struct tss_struct { unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; /* - * .. 
and then another 0x100 bytes for the emergency kernel stack: + * Space for the temporary SYSENTER stack: */ - unsigned long stack[64]; + unsigned long SYSENTER_stack[64]; } ____cacheline_aligned; diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 3b3b9d33ac1d..47703aed74cf 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -68,7 +68,7 @@ void foo(void) /* Offset from the sysenter stack to tss.sp0 */ DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - - sizeof(struct tss_struct)); + offsetofend(struct tss_struct, SYSENTER_stack)); #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 76348334b934..7a3dfb1db78d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -987,7 +987,7 @@ void enable_sep_cpu(void) } tss->x86_tss.ss1 = __KERNEL_CS; - tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; + tss->x86_tss.sp1 = (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack); wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); From 8b6c0ab1a1296ef6922160fa27018f25c60b8940 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 16 Mar 2015 10:32:20 +0100 Subject: [PATCH 069/148] x86/asm/entry: Document and clean up the enable_sep_cpu() and syscall32_cpu_init() functions Clean up the flow and document the functions a bit better. Cc: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 42 +++++++++++++++++++++++++----------- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 7a3dfb1db78d..d79f139871e0 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -960,37 +960,53 @@ static void identify_cpu(struct cpuinfo_x86 *c) } #ifdef CONFIG_X86_64 -#ifdef CONFIG_IA32_EMULATION +# ifdef CONFIG_IA32_EMULATION /* May not be __init: called during resume */ static void syscall32_cpu_init(void) { - /* Load these always in case some future AMD CPU supports - SYSENTER from compat mode too. 
*/ + /* + * Always load these, in case some future 64-bit CPU supports + * SYSENTER from compat mode too: + */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); wrmsrl(MSR_CSTAR, ia32_cstar_target); } -#endif /* CONFIG_IA32_EMULATION */ -#endif /* CONFIG_X86_64 */ +# endif +#endif +/* + * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions + * on 32-bit kernels: + */ #ifdef CONFIG_X86_32 void enable_sep_cpu(void) { - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(cpu_tss, cpu); + struct tss_struct *tss; + int cpu; - if (!boot_cpu_has(X86_FEATURE_SEP)) { - put_cpu(); - return; - } + cpu = get_cpu(); + tss = &per_cpu(cpu_tss, cpu); + + if (!boot_cpu_has(X86_FEATURE_SEP)) + goto out; + + /* + * The struct::SS1 and tss_struct::SP1 fields are not used by the hardware, + * we cache the SYSENTER CS and ESP values there for easy access: + */ tss->x86_tss.ss1 = __KERNEL_CS; + wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); + tss->x86_tss.sp1 = (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack); - wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); + + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0); + +out: put_cpu(); } #endif From 33db1fd48ac3d90385b412b41a8a6525096ac6d5 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 17 Mar 2015 14:52:24 +0100 Subject: [PATCH 070/148] x86/asm/entry/64: Enable interrupts *after* we fetch PER_CPU_VAR(old_rsp) We want to use PER_CPU_VAR(old_rsp) as a simple temporary register, to shuffle user-space RSP into (and from) when we set up the system call stack frame. At that point we cannot shuffle values into general purpose registers, because we have not saved them yet. To be able to do this shuffling into a memory location, we must be atomic and must not be preempted while we do the shuffling, otherwise the 'temporary' register gets overwritten by some other task's temporary register contents ... Tested-by: Borislav Petkov Signed-off-by: Denys Vlasenko Acked-by: Borislav Petkov Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1426600344-8254-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index d86788c3257b..aed3f11c373b 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -241,16 +241,16 @@ GLOBAL(system_call_after_swapgs) movq %rsp,PER_CPU_VAR(old_rsp) /* kernel_stack is set so that 5 slots (iret frame) are preallocated */ movq PER_CPU_VAR(kernel_stack),%rsp - /* - * No need to follow this irqs off/on section - it's straight - * and short: - */ - ENABLE_INTERRUPTS(CLBR_NONE) ALLOC_PT_GPREGS_ON_STACK 8 /* +8: space for orig_ax */ movq %rcx,RIP(%rsp) movq PER_CPU_VAR(old_rsp),%rcx movq %r11,EFLAGS(%rsp) movq %rcx,RSP(%rsp) + /* + * No need to follow this irqs off/on section - it's straight + * and short: + */ + ENABLE_INTERRUPTS(CLBR_NONE) movq_cfi rax,ORIG_RAX SAVE_C_REGS_EXCEPT_RAX_RCX_R11 movq $-ENOSYS,RAX(%rsp) From 9854dd74c3f6af8d9d527de86c6074b7ed0495f1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Mar 2015 14:42:59 +0100 Subject: [PATCH 071/148] x86/asm/entry/64: Simplify 'old_rsp' usage Remove all manipulations of PER_CPU(old_rsp) in C code: - it is not used on SYSRET return anymore, and system entries are atomic, so updating it from the fork and context switch paths is pointless. - Tweak a few related comments as well: we no longer have a "partial stack frame" on entry, ever. Based on (split out of) patch from Denys Vlasenko. Originally-from: Denys Vlasenko Tested-by: Borislav Petkov Acked-by: Borislav Petkov Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1426599779-8010-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 5 ----- arch/x86/kernel/process_64.c | 2 -- 2 files changed, 7 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 5abd9a535a24..3ac5092ec113 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -905,11 +905,6 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) extern unsigned long KSTK_ESP(struct task_struct *task); -/* - * User space RSP while inside the SYSCALL fast path - */ -DECLARE_PER_CPU(unsigned long, old_rsp); - #endif /* CONFIG_X86_64 */ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index e8c124a1f885..59696d76a4e3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -238,7 +238,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, current->thread.usersp = new_sp; regs->ip = new_ip; regs->sp = new_sp; - this_cpu_write(old_rsp, new_sp); regs->cs = _cs; regs->ss = _ss; regs->flags = X86_EFLAGS_IF; @@ -399,7 +398,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * Switch the PDA and FPU contexts. 
*/ prev->usersp = this_cpu_read(old_rsp); - this_cpu_write(old_rsp, next->usersp); this_cpu_write(current_task, next_p); /* From ac9af4983e77765a642b5a21086bc1fdc55418c4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Mar 2015 14:42:59 +0100 Subject: [PATCH 072/148] x86/asm/entry/64: Remove thread_struct::usersp Nothing uses thread_struct::usersp anymore, so remove it. Originally-from: Denys Vlasenko Tested-by: Borislav Petkov Acked-by: Borislav Petkov Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 1 - arch/x86/kernel/process_64.c | 3 --- 2 files changed, 4 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 3ac5092ec113..572099710ba2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -493,7 +493,6 @@ struct thread_struct { #ifdef CONFIG_X86_32 unsigned long sysenter_cs; #else - unsigned long usersp; /* Copy from PDA */ unsigned short es; unsigned short ds; unsigned short fsindex; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 59696d76a4e3..14df2be4711f 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -161,7 +161,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; childregs = task_pt_regs(p); p->thread.sp = (unsigned long) childregs; - p->thread.usersp = me->thread.usersp; set_tsk_thread_flag(p, TIF_FORK); p->thread.io_bitmap_ptr = NULL; @@ -235,7 +234,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); - current->thread.usersp = new_sp; regs->ip = new_ip; regs->sp = new_sp; regs->cs = _cs; @@ -397,7 +395,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. */ - prev->usersp = this_cpu_read(old_rsp); this_cpu_write(current_task, next_p); /* From 7fcb3bc361c724a75bc642dbdd9d9bf0bdf07260 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Mar 2015 14:42:59 +0100 Subject: [PATCH 073/148] x86/asm/entry/64: Update comments about stack frames Tweak a few outdated comments that were obsoleted by recent changes to syscall entry code: - we no longer have a "partial stack frame" on entry, ever. - explain the syscall entry usage of old_rsp. Partially based on a (split out of) patch from Denys Vlasenko. Originally-from: Denys Vlasenko Acked-by: Borislav Petkov Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index aed3f11c373b..d287f785089e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -15,10 +15,8 @@ * after an interrupt and after each system call. * * A note on terminology: - * - top of stack: Architecture defined interrupt frame from SS to RIP + * - iret frame: Architecture defined interrupt frame from SS to RIP * at the top of the kernel process stack. - * - partial stack frame: partially saved registers up to R11. 
- * - full stack frame: Like partial stack frame, but all register saved. * * Some macro usage: * - CFI macros are used to generate dwarf2 unwind information for better @@ -219,7 +217,7 @@ ENDPROC(native_usergs_sysret64) * Interrupts are off on entry. * Only called from user space. * - * When user can change the frames always force IRET. That is because + * When user can change pt_regs->foo always force IRET. That is because * it deals with uncanonical addresses better. SYSRET has trouble * with them due to bugs in both AMD and Intel CPUs. */ @@ -238,6 +236,11 @@ ENTRY(system_call) */ GLOBAL(system_call_after_swapgs) + /* + * We use 'old_rsp' as a scratch register, hence this block must execute + * atomically in the face of possible interrupt-driven task preemption, + * so we can enable interrupts only after we're done with using old_rsp: + */ movq %rsp,PER_CPU_VAR(old_rsp) /* kernel_stack is set so that 5 slots (iret frame) are preallocated */ movq PER_CPU_VAR(kernel_stack),%rsp @@ -303,7 +306,7 @@ int_ret_from_sys_call_fixup: FIXUP_TOP_OF_STACK %r11 jmp int_ret_from_sys_call - /* Do syscall tracing */ + /* Do syscall entry tracing */ tracesys: movq %rsp, %rdi movq $AUDIT_ARCH_X86_64, %rsi @@ -339,11 +342,11 @@ tracesys_phase2: movq %r10,%rcx /* fixup for C */ call *sys_call_table(,%rax,8) movq %rax,RAX(%rsp) - /* Use IRET because user could have changed frame */ + /* Use IRET because user could have changed pt_regs->foo */ /* * Syscall return path ending with IRET. - * Has correct top of stack, but partial stack frame. + * Has correct iret frame. */ GLOBAL(int_ret_from_sys_call) DISABLE_INTERRUPTS(CLBR_NONE) @@ -374,7 +377,7 @@ int_careful: TRACE_IRQS_OFF jmp int_with_check - /* handle signals and tracing -- both require a full stack frame */ + /* handle signals and tracing -- both require a full pt_regs */ int_very_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) From c38e503804b0402c510f82437069f7769fa0cea9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 17 Mar 2015 14:42:59 +0100 Subject: [PATCH 074/148] x86/asm/entry/64: Rename 'old_rsp' to 'rsp_scratch' Make clear that the usage of PER_CPU(old_rsp) is purely temporary, by renaming it to 'rsp_scratch'. Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 10 +++++----- arch/x86/kernel/process_64.c | 2 +- arch/x86/xen/xen-asm_64.S | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index d287f785089e..0c91256d73df 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -237,16 +237,16 @@ ENTRY(system_call) GLOBAL(system_call_after_swapgs) /* - * We use 'old_rsp' as a scratch register, hence this block must execute + * We use 'rsp_scratch' as a scratch register, hence this block must execute * atomically in the face of possible interrupt-driven task preemption, - * so we can enable interrupts only after we're done with using old_rsp: + * so we can enable interrupts only after we're done with using rsp_scratch: */ - movq %rsp,PER_CPU_VAR(old_rsp) + movq %rsp,PER_CPU_VAR(rsp_scratch) /* kernel_stack is set so that 5 slots (iret frame) are preallocated */ movq PER_CPU_VAR(kernel_stack),%rsp ALLOC_PT_GPREGS_ON_STACK 8 /* +8: space for orig_ax */ movq %rcx,RIP(%rsp) - movq PER_CPU_VAR(old_rsp),%rcx + movq PER_CPU_VAR(rsp_scratch),%rcx movq %r11,EFLAGS(%rsp) movq %rcx,RSP(%rsp) /* @@ -657,7 +657,7 @@ common_interrupt: ASM_CLAC addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ - /* 0(%rsp): old_rsp */ + /* 0(%rsp): rsp_scratch */ ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 14df2be4711f..97f5658290b7 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ asmlinkage extern void ret_from_fork(void); -__visible DEFINE_PER_CPU(unsigned long, old_rsp); +__visible DEFINE_PER_CPU(unsigned long, rsp_scratch); /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, int all) diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 53adefda4275..985fc3ee0973 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -68,11 +68,11 @@ ENTRY(xen_sysret64) * We're already on the usermode stack at this point, but * still with the kernel gs, so we can easily switch back */ - movq %rsp, PER_CPU_VAR(old_rsp) + movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(kernel_stack), %rsp pushq $__USER_DS - pushq PER_CPU_VAR(old_rsp) + pushq PER_CPU_VAR(rsp_scratch) pushq %r11 pushq $__USER_CS pushq %rcx @@ -87,11 +87,11 @@ ENTRY(xen_sysret32) * We're already on the usermode stack at this point, but * still with the kernel gs, so we can easily switch back */ - movq %rsp, PER_CPU_VAR(old_rsp) + movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(kernel_stack), %rsp pushq $__USER32_DS - pushq PER_CPU_VAR(old_rsp) + pushq PER_CPU_VAR(rsp_scratch) pushq %r11 pushq $__USER32_CS pushq %rcx From 1daeaa315164c60b937f56fe3848d4328c358eba Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 21 Mar 2015 18:54:21 -0400 Subject: [PATCH 075/148] x86/asm/entry: Fix execve() and sigreturn() syscalls to always return via IRET Both the execve() and sigreturn() family of syscalls have the ability to change registers in ways that may not be compatabile with the syscall path they were called from. In particular, SYSRET and SYSEXIT can't handle non-default %cs and %ss, and some bits in eflags. These syscalls have stubs that are hardcoded to jump to the IRET path, and not return to the original syscall path. 
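As a reading aid (explicitly not kernel code), here is a tiny standalone C model of the return-path decision described above; all names in it are invented for the sketch. The real mechanism is the TIF_NOTIFY_RESUME work flag, which the force_iret() macro added below sets so that the exit code takes the IRET branch:

#include <stdbool.h>
#include <stdio.h>

struct task_model {
	bool notify_resume;		/* stands in for TIF_NOTIFY_RESUME */
};

/* models "#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME)" */
static void force_iret(struct task_model *t)
{
	t->notify_resume = true;
}

/* an execve()- or sigreturn()-style syscall may install non-default
   CS/SS or EFLAGS bits, so it always requests the general IRET exit */
static void sigreturn_like_syscall(struct task_model *t)
{
	force_iret(t);
}

static const char *exit_path(const struct task_model *t)
{
	return t->notify_resume ? "iret (slower, restores everything)"
				: "sysret/sysexit (fast, restricted)";
}

int main(void)
{
	struct task_model t = { .notify_resume = false };

	sigreturn_like_syscall(&t);
	printf("return to user space via %s\n", exit_path(&t));
	return 0;
}
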
The following commit: 76f5df43cab5e76 ("Always allocate a complete "struct pt_regs" on the kernel stack") recently changed this for some 32-bit compat syscalls, but introduced a bug where execve from a 32-bit program to a 64-bit program would fail because it still returned via SYSRETL. This caused Wine to fail when built for both 32-bit and 64-bit. This patch sets TIF_NOTIFY_RESUME for execve() and sigreturn() so that the IRET path is always taken on exit to userspace. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1426978461-32089-1-git-send-email-brgerst@gmail.com [ Improved the changelog and comments. ] Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 2 ++ arch/x86/include/asm/ptrace.h | 2 +- arch/x86/include/asm/thread_info.h | 10 ++++++++++ arch/x86/kernel/process_32.c | 6 +----- arch/x86/kernel/process_64.c | 1 + arch/x86/kernel/signal.c | 2 ++ 6 files changed, 17 insertions(+), 6 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index d0165c9a2932..1f5e2b0e09ff 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -203,6 +203,8 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, err |= restore_xstate_sig(buf, 1); + force_iret(); + return err; } diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 74bb2e0f3030..83b874da2762 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -251,7 +251,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, */ #define arch_ptrace_stop_needed(code, info) \ ({ \ - set_thread_flag(TIF_NOTIFY_RESUME); \ + force_iret(); \ false; \ }) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ba115eb6fbcf..0abf7ab20ce2 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -260,6 +260,16 @@ static inline bool is_ia32_task(void) #endif return false; } + +/* + * Force syscall return via IRET by making it look as if there was + * some work pending. IRET is our most capable (but slowest) syscall + * return path, which is able to restore modified SS, CS and certain + * EFLAGS values that other (fast) syscall return instructions + * are not able to restore properly. + */ +#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME) + #endif /* !__ASSEMBLY__ */ #ifndef __ASSEMBLY__ diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 1b9963faf4eb..26c596d1ee07 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -206,11 +206,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) regs->ip = new_ip; regs->sp = new_sp; regs->flags = X86_EFLAGS_IF; - /* - * force it to the iret return path by making it look as if there was - * some work pending. 
- */ - set_thread_flag(TIF_NOTIFY_RESUME); + force_iret(); } EXPORT_SYMBOL_GPL(start_thread); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 97f5658290b7..da8b74598d90 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -239,6 +239,7 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, regs->cs = _cs; regs->ss = _ss; regs->flags = X86_EFLAGS_IF; + force_iret(); } void diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index edcb862cdcae..eaa2c5e3f2cd 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -108,6 +108,8 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); + force_iret(); + return err; } From d31bf07f71a5568b48c5ed448e4299050469f615 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Mar 2015 18:33:27 -0700 Subject: [PATCH 076/148] x86/mm/fault: Use TASK_SIZE_MAX in is_prefetch() This is slightly shorter and slightly faster. It's also more correct: the split between user and kernel addresses is TASK_SIZE_MAX, regardless of ti->flags. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brad Spengler Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/09156b63bad90a327827003c9e53faa82ef4c56e.1426728647.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ede025fb46f1..ae340d3761ca 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -148,7 +148,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; - if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX) return 0; while (instr < max_instr) { From c56716af8d27ca8dd6e45445ae1c0a05fd9753a6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Mar 2015 18:33:28 -0700 Subject: [PATCH 077/148] x86/asm/entry, perf: Fix incorrect TIF_IA32 check in code_segment_base() We want to check whether user code is in 32-bit mode, not whether the task is nominally 32-bit. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brad Spengler Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/33e5107085ce347a8303560302b15c2cadd62c4c.1426728647.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b71a7f86d68a..979963bb3977 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2161,10 +2161,9 @@ static unsigned long code_segment_base(struct pt_regs *regs) if (user_mode(regs) && regs->cs != __USER_CS) return get_segment_base(regs->cs); #else - if (test_thread_flag(TIF_IA32)) { - if (user_mode(regs) && regs->cs != __USER32_CS) - return get_segment_base(regs->cs); - } + if (user_mode(regs) && !user_64bit_mode(regs) && + regs->cs != __USER32_CS) + return get_segment_base(regs->cs); #endif return 0; } From a67e7277d01ccfd39b0db5a198c2643cc19dd79c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Mar 2015 18:33:29 -0700 Subject: [PATCH 078/148] x86/asm/entry: Add user_mode_ignore_vm86() user_mode() is dangerous and user_mode_vm() has a confusing name. Add user_mode_ignore_vm86() (equivalent to current user_mode()). We'll change the small number of legitimate users of user_mode() to user_mode_ignore_vm86(). Inspired by grsec, although this works rather differently. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brad Spengler Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/202c56ca63823c338af8e2e54948dbe222da6343.1426728647.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ptrace.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 83b874da2762..4a040f0078f2 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -121,6 +121,23 @@ static inline int user_mode_vm(struct pt_regs *regs) #endif } +/* + * This is the fastest way to check whether regs come from user space. + * It is unsafe if regs might come from vm86 mode, though -- in vm86 + * mode, all bits of CS and SS are completely under the user's control. + * The CPU considers vm86 mode to be CPL 3 regardless of CS and SS. + * + * Do NOT use this function unless you have already ruled out the + * possibility that regs came from vm86 mode. + * + * We check for RPL != 0 instead of RPL == 3 because we don't use rings + * 1 or 2 and this is more efficient. + */ +static inline int user_mode_ignore_vm86(struct pt_regs *regs) +{ + return (regs->cs & SEGMENT_RPL_MASK) != 0; +} + static inline int v8086_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_32 From 383f3af3f88aadafe1fcf1948987ad538683fb8c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Mar 2015 18:33:30 -0700 Subject: [PATCH 079/148] x86/asm/entry, perf: Explicitly optimize vm86 handling in code_segment_base() There's no point in checking the VM bit on 64-bit, and, since we're explicitly checking it, we can use user_mode_ignore_vm86() after the check. While we're at it, rearrange the #ifdef slightly to make the code flow a bit clearer. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brad Spengler Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/dc1457a734feccd03a19bb3538a7648582f57cdd.1426728647.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 979963bb3977..56f7e60ad732 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2146,6 +2146,12 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) */ static unsigned long code_segment_base(struct pt_regs *regs) { + /* + * For IA32 we look at the GDT/LDT segment base to convert the + * effective IP to a linear address. + */ + +#ifdef CONFIG_X86_32 /* * If we are in VM86 mode, add the segment offset to convert to a * linear address. @@ -2153,12 +2159,7 @@ static unsigned long code_segment_base(struct pt_regs *regs) if (regs->flags & X86_VM_MASK) return 0x10 * regs->cs; - /* - * For IA32 we look at the GDT/LDT segment base to convert the - * effective IP to a linear address. - */ -#ifdef CONFIG_X86_32 - if (user_mode(regs) && regs->cs != __USER_CS) + if (user_mode_ignore_vm86(regs) && regs->cs != __USER_CS) return get_segment_base(regs->cs); #else if (user_mode(regs) && !user_64bit_mode(regs) && From ae60f0710ae6b33092267ef8ac853c498f6d3e5d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Mar 2015 18:33:31 -0700 Subject: [PATCH 080/148] x86/asm/entry: Use user_mode_ignore_vm86() where appropriate A few of the user_mode() checks in traps.c are immediately after explicit checks for vm86 mode. Change them to user_mode_ignore_vm86(). Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brad Spengler Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/0b324d5b75c3402be07f8d3c6245ed7f4995029e.1426728647.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 277341128b47..1136961a679b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -208,7 +208,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, return -1; } #endif - if (!user_mode(regs)) { + if (!user_mode_ignore_vm86(regs)) { if (!fixup_exception(regs)) { tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; @@ -471,7 +471,7 @@ do_general_protection(struct pt_regs *regs, long error_code) #endif tsk = current; - if (!user_mode(regs)) { + if (!user_mode_ignore_vm86(regs)) { if (fixup_exception(regs)) goto exit; @@ -688,7 +688,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) * We already checked v86 mode above, so we can check for kernel mode * by just checking the CPL of CS. */ - if ((dr6 & DR_STEP) && !user_mode(regs)) { + if ((dr6 & DR_STEP) && !user_mode_ignore_vm86(regs)) { tsk->thread.debugreg6 &= ~DR_STEP; set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; From efa704510342b81ae58d7b8a0c7f676a4289b603 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Mar 2015 18:33:32 -0700 Subject: [PATCH 081/148] x86/asm/entry: Make user_mode() work correctly if regs came from VM86 mode user_mode() is now identical to user_mode_vm(). Subsequent patches will change all callers of user_mode_vm() to user_mode() and then delete user_mode_vm(). 
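For reference, the one-comparison test that user_mode() now shares with the old user_mode_vm() on 32-bit kernels looks roughly like this (sketch only; user_mode_sketch() is a made-up name, the real definition lives in ptrace.h as shown below):

	static inline int user_mode_sketch(struct pt_regs *regs)
	{
		/*
		 * True for a protected-mode RPL-3 CS *or* for V8086 mode,
		 * folded into a single comparison.
		 */
		return ((regs->cs & SEGMENT_RPL_MASK) |
			(regs->flags & X86_VM_MASK)) >= USER_RPL;
	}
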
Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brad Spengler Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/0dd03eacb5f0a2b5ba0240de25347a31b493c289.1426728647.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ptrace.h | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 4a040f0078f2..70c439f9071b 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -96,11 +96,13 @@ static inline unsigned long regs_return_value(struct pt_regs *regs) } /* - * user_mode_vm(regs) determines whether a register set came from user mode. - * This is true if V8086 mode was enabled OR if the register set was from - * protected mode with RPL-3 CS value. This tricky test checks that with - * one comparison. Many places in the kernel can bypass this full check - * if they have already ruled out V8086 mode, so user_mode(regs) can be used. + * user_mode(regs) determines whether a register set came from user + * mode. On x86_32, this is true if V8086 mode was enabled OR if the + * register set was from protected mode with RPL-3 CS value. This + * tricky test checks that with one comparison. + * + * On x86_64, vm86 mode is mercifully nonexistent, and we don't need + * the extra check. */ static inline int user_mode(struct pt_regs *regs) { @@ -113,12 +115,7 @@ static inline int user_mode(struct pt_regs *regs) static inline int user_mode_vm(struct pt_regs *regs) { -#ifdef CONFIG_X86_32 - return ((regs->cs & SEGMENT_RPL_MASK) | (regs->flags & X86_VM_MASK)) >= - USER_RPL; -#else return user_mode(regs); -#endif } /* From f39b6f0ef855a38ea17329a4e621ff97750dfcc2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Mar 2015 18:33:33 -0700 Subject: [PATCH 082/148] x86/asm/entry: Change all 'user_mode_vm()' calls to 'user_mode()' user_mode_vm() and user_mode() are now the same. Change all callers of user_mode_vm() to user_mode(). The next patch will remove the definition of user_mode_vm. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brad Spengler Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/43b1f57f3df70df5a08b0925897c660725015554.1426728647.git.luto@kernel.org [ Merged to a more recent kernel. 
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 2 +- arch/x86/kernel/crash.c | 2 +- arch/x86/kernel/dumpstack.c | 4 ++-- arch/x86/kernel/dumpstack_32.c | 4 ++-- arch/x86/kernel/i387.c | 2 +- arch/x86/kernel/irq_32.c | 2 +- arch/x86/kernel/irq_64.c | 2 +- arch/x86/kernel/kgdb.c | 4 ++-- arch/x86/kernel/kprobes/core.c | 4 ++-- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/ptrace.c | 2 +- arch/x86/kernel/time.c | 2 +- arch/x86/kernel/traps.c | 16 ++++++++-------- arch/x86/kernel/uprobes.c | 2 +- arch/x86/mm/fault.c | 6 +++--- arch/x86/oprofile/backtrace.c | 2 +- drivers/misc/sgi-xp/xpc_main.c | 2 +- 17 files changed, 30 insertions(+), 30 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index af397cc98d05..5c993c94255e 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -715,7 +715,7 @@ int poke_int3_handler(struct pt_regs *regs) if (likely(!bp_patching_in_progress)) return 0; - if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr) + if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr) return 0; /* set up the specified breakpoint handler */ diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index aceb2f90c716..c76d3e37c6e1 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -105,7 +105,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) #ifdef CONFIG_X86_32 struct pt_regs fixed_regs; - if (!user_mode_vm(regs)) { + if (!user_mode(regs)) { crash_fixup_ss_esp(&fixed_regs, regs); regs = &fixed_regs; } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index cf3df1d8d039..ab3b65639a3e 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -278,7 +278,7 @@ int __die(const char *str, struct pt_regs *regs, long err) print_modules(); show_regs(regs); #ifdef CONFIG_X86_32 - if (user_mode_vm(regs)) { + if (user_mode(regs)) { sp = regs->sp; ss = regs->ss & 0xffff; } else { @@ -307,7 +307,7 @@ void die(const char *str, struct pt_regs *regs, long err) unsigned long flags = oops_begin(); int sig = SIGSEGV; - if (!user_mode_vm(regs)) + if (!user_mode(regs)) report_bug(regs->ip, regs); if (__die(str, regs, err)) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 5abd4cd4230c..39891ff50d03 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -123,13 +123,13 @@ void show_regs(struct pt_regs *regs) int i; show_regs_print_info(KERN_EMERG); - __show_regs(regs, !user_mode_vm(regs)); + __show_regs(regs, !user_mode(regs)); /* * When in-kernel, we also print out the stack and code at the * time of the fault.. 
*/ - if (!user_mode_vm(regs)) { + if (!user_mode(regs)) { unsigned int code_prologue = code_bytes * 43 / 64; unsigned int code_len = code_bytes; unsigned char c; diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index d5651fce0b71..29c740deafec 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -68,7 +68,7 @@ static inline bool interrupted_kernel_fpu_idle(void) static inline bool interrupted_user_mode(void) { struct pt_regs *regs = get_irq_regs(); - return regs && user_mode_vm(regs); + return regs && user_mode(regs); } /* diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 28d28f5eb8f4..f9fd86a7fcc7 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -165,7 +165,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) if (unlikely(!desc)) return false; - if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) { + if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) { if (unlikely(overflow)) print_stack_overflow(); desc->handle_irq(irq, desc); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index e4b503d5558c..394e643d7830 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -44,7 +44,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) u64 estack_top, estack_bottom; u64 curbase = (u64)task_stack_page(current); - if (user_mode_vm(regs)) + if (user_mode(regs)) return; if (regs->sp >= curbase + sizeof(struct thread_info) + diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 7ec1d5f8d283..7fe3a9d377ea 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -126,11 +126,11 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) #ifdef CONFIG_X86_32 switch (regno) { case GDB_SS: - if (!user_mode_vm(regs)) + if (!user_mode(regs)) *(unsigned long *)mem = __KERNEL_DS; break; case GDB_SP: - if (!user_mode_vm(regs)) + if (!user_mode(regs)) *(unsigned long *)mem = kernel_stack_pointer(regs); break; case GDB_GS: diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 4e3d5a9621fe..24d079604fd5 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -602,7 +602,7 @@ int kprobe_int3_handler(struct pt_regs *regs) struct kprobe *p; struct kprobe_ctlblk *kcb; - if (user_mode_vm(regs)) + if (user_mode(regs)) return 0; addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); @@ -1007,7 +1007,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, struct die_args *args = data; int ret = NOTIFY_DONE; - if (args->regs && user_mode_vm(args->regs)) + if (args->regs && user_mode(args->regs)) return ret; if (val == DIE_GPF) { diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 26c596d1ee07..c5e987022ca0 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -73,7 +73,7 @@ void __show_regs(struct pt_regs *regs, int all) unsigned long sp; unsigned short ss, gs; - if (user_mode_vm(regs)) { + if (user_mode(regs)) { sp = regs->sp; ss = regs->ss & 0xffff; gs = get_user_gs(regs); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 1e125817cf9f..a7bc79480719 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1415,7 +1415,7 @@ static void fill_sigtrap_info(struct task_struct *tsk, memset(info, 0, sizeof(*info)); info->si_signo = SIGTRAP; info->si_code = si_code; - info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL; + info->si_addr = user_mode(regs) ? 
(void __user *)regs->ip : NULL; } void user_single_step_siginfo(struct task_struct *tsk, diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 25adc0e16eaa..d39c09119db6 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -30,7 +30,7 @@ unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); - if (!user_mode_vm(regs) && in_lock_functions(pc)) { + if (!user_mode(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + sizeof(long)); #else diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1136961a679b..d4e265952102 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -112,7 +112,7 @@ enum ctx_state ist_enter(struct pt_regs *regs) { enum ctx_state prev_state; - if (user_mode_vm(regs)) { + if (user_mode(regs)) { /* Other than that, we're just an exception. */ prev_state = exception_enter(); } else { @@ -146,7 +146,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) /* Must be before exception_exit. */ preempt_count_sub(HARDIRQ_OFFSET); - if (user_mode_vm(regs)) + if (user_mode(regs)) return exception_exit(prev_state); else rcu_nmi_exit(); @@ -158,7 +158,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) * * IST exception handlers normally cannot schedule. As a special * exception, if the exception interrupted userspace code (i.e. - * user_mode_vm(regs) would return true) and the exception was not + * user_mode(regs) would return true) and the exception was not * a double fault, it can be safe to schedule. ist_begin_non_atomic() * begins a non-atomic section within an ist_enter()/ist_exit() region. * Callers are responsible for enabling interrupts themselves inside @@ -167,7 +167,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) */ void ist_begin_non_atomic(struct pt_regs *regs) { - BUG_ON(!user_mode_vm(regs)); + BUG_ON(!user_mode(regs)); /* * Sanity check: we need to be on the normal thread stack. This @@ -384,7 +384,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) goto exit; conditional_sti(regs); - if (!user_mode_vm(regs)) + if (!user_mode(regs)) die("bounds", regs, error_code); if (!cpu_feature_enabled(X86_FEATURE_MPX)) { @@ -587,7 +587,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) /* Copy the remainder of the stack from the current stack. */ memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); - BUG_ON(!user_mode_vm(&new_stack->regs)); + BUG_ON(!user_mode(&new_stack->regs)); return new_stack; } NOKPROBE_SYMBOL(fixup_bad_iret); @@ -637,7 +637,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) * then it's very likely the result of an icebp/int01 trap. * User wants a sigtrap for that. */ - if (!dr6 && user_mode_vm(regs)) + if (!dr6 && user_mode(regs)) user_icebp = 1; /* Catch kmemcheck conditions first of all! 
*/ @@ -721,7 +721,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) return; conditional_sti(regs); - if (!user_mode_vm(regs)) + if (!user_mode(regs)) { if (!fixup_exception(regs)) { task->thread.error_code = error_code; diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 81f8adb0679e..0b81ad67da07 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -912,7 +912,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, int ret = NOTIFY_DONE; /* We are only interested in userspace traps */ - if (regs && !user_mode_vm(regs)) + if (regs && !user_mode(regs)) return NOTIFY_DONE; switch (val) { diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ae340d3761ca..181c53bac3a7 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -59,7 +59,7 @@ static nokprobe_inline int kprobes_fault(struct pt_regs *regs) int ret = 0; /* kprobe_running() needs smp_processor_id() */ - if (kprobes_built_in() && !user_mode_vm(regs)) { + if (kprobes_built_in() && !user_mode(regs)) { preempt_disable(); if (kprobe_running() && kprobe_fault_handler(regs, 14)) ret = 1; @@ -1035,7 +1035,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs) if (error_code & PF_USER) return false; - if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC)) + if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC)) return false; return true; @@ -1140,7 +1140,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * User-mode registers count as a user access even for any * potential system fault or CPU buglet: */ - if (user_mode_vm(regs)) { + if (user_mode(regs)) { local_irq_enable(); error_code |= PF_USER; flags |= FAULT_FLAG_USER; diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 5d04be5efb64..4e664bdb535a 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -111,7 +111,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) { struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); - if (!user_mode_vm(regs)) { + if (!user_mode(regs)) { unsigned long stack = kernel_stack_pointer(regs); if (depth) dump_trace(NULL, regs, (unsigned long *)stack, 0, diff --git a/drivers/misc/sgi-xp/xpc_main.c b/drivers/misc/sgi-xp/xpc_main.c index 82dc5748f873..7f327121e6d7 100644 --- a/drivers/misc/sgi-xp/xpc_main.c +++ b/drivers/misc/sgi-xp/xpc_main.c @@ -1210,7 +1210,7 @@ xpc_system_die(struct notifier_block *nb, unsigned long event, void *_die_args) if (((die_args->trapnr == X86_TRAP_MF) || (die_args->trapnr == X86_TRAP_XF)) && - !user_mode_vm(die_args->regs)) + !user_mode(die_args->regs)) xpc_die_deactivate(); break; From 7a2806741e7327a6b20ccef42e8d56588cb2fef5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Mar 2015 18:33:34 -0700 Subject: [PATCH 083/148] x86/asm/entry: Remove user_mode_vm() It has no callers anymore. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brad Spengler Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a594afd6a0bddb1311bd7c92a15201c87fbb8681.1426728647.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ptrace.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 70c439f9071b..d20bae298852 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -113,11 +113,6 @@ static inline int user_mode(struct pt_regs *regs) #endif } -static inline int user_mode_vm(struct pt_regs *regs) -{ - return user_mode(regs); -} - /* * This is the fastest way to check whether regs come from user space. * It is unsafe if regs might come from vm86 mode, though -- in vm86 From d74ef1118a146ae1135c8b26fff2bfee980fd7a4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Mar 2015 18:33:35 -0700 Subject: [PATCH 084/148] x86/asm/entry: Replace some open-coded VM86 checks with v8086_mode() checks This allows us to remove some unnecessary ifdefs. There should be no change to the generated code. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brad Spengler Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f7e00f0d668e253abf0bd8bf36491ac47bd761ff.1426728647.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index d4e265952102..c8eb469a94a4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -194,8 +194,7 @@ static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, struct pt_regs *regs, long error_code) { -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) { + if (v8086_mode(regs)) { /* * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. * On nmi (interrupt 2), do_trap should not be called. @@ -207,7 +206,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, } return -1; } -#endif + if (!user_mode_ignore_vm86(regs)) { if (!fixup_exception(regs)) { tsk->thread.error_code = error_code; @@ -462,13 +461,11 @@ do_general_protection(struct pt_regs *regs, long error_code) prev_state = exception_enter(); conditional_sti(regs); -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) { + if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); goto exit; } -#endif tsk = current; if (!user_mode_ignore_vm86(regs)) { @@ -673,7 +670,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); - if (regs->flags & X86_VM_MASK) { + if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, X86_TRAP_DB); preempt_conditional_cli(regs); From 34061f134f70d33296fa56678cee122dd7010401 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 23 Mar 2015 14:03:59 +0100 Subject: [PATCH 085/148] x86/asm/entry/64: Fix incorrect comment The recent old_rsp -> rsp_scratch rename also changed this comment, but in this case "old_rsp" was not referring to PER_CPU(old_rsp). Fix this. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427115839-6397-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 0c91256d73df..9184392a47b3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -657,7 +657,7 @@ common_interrupt: ASM_CLAC addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ - /* 0(%rsp): rsp_scratch */ + /* 0(%rsp): old RSP */ ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF From a76c7f4604937bc781bfc411ef92c59474ddadda Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 22 Mar 2015 20:48:14 +0100 Subject: [PATCH 086/148] x86/asm/entry/64: Fold syscall32_cpu_init() into its sole user Having syscall32/sysenter32 initialization in a separate tiny function, called from within a function that is already syscall init specific, serves no real purpose. Its existence also caused an unintended effect of having wrmsrl(MSR_CSTAR) performed twice: once we set it to a dummy function returning -ENOSYS, and immediately after (if CONFIG_IA32_EMULATION), we set it to point to the proper syscall32 entry point, ia32_cstar_target. Signed-off-by: Denys Vlasenko Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 31 ++++++++++--------------------- 1 file changed, 10 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d79f139871e0..66d62aef7214 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -959,24 +959,6 @@ static void identify_cpu(struct cpuinfo_x86 *c) #endif } -#ifdef CONFIG_X86_64 -# ifdef CONFIG_IA32_EMULATION -/* May not be __init: called during resume */ -static void syscall32_cpu_init(void) -{ - /* - * Always load these, in case some future 64-bit CPU supports - * SYSENTER from compat mode too: - */ - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); - - wrmsrl(MSR_CSTAR, ia32_cstar_target); -} -# endif -#endif - /* * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions * on 32-bit kernels: @@ -1187,10 +1169,17 @@ void syscall_init(void) */ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); wrmsrl(MSR_LSTAR, system_call); +#ifndef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, ignore_sysret); - -#ifdef CONFIG_IA32_EMULATION - syscall32_cpu_init(); +#else + wrmsrl(MSR_CSTAR, ia32_cstar_target); + /* + * Always load these, in case some future 64-bit CPU supports + * SYSENTER from compat mode too: + */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); #endif /* Flags to clear on syscall */ From b3fe8ba320ace38cee6859b4c015d81627254ddb Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 19 Mar 2015 18:17:45 +0100 Subject: [PATCH 087/148] x86/asm/entry/64: Change the THREAD_INFO() definition to not depend on KERNEL_STACK_OFFSET This changes the THREAD_INFO() definition and all its callsites so that they do not count stack position from (top of stack -
KERNEL_STACK_OFFSET), but from top of stack. Semi-mysterious expressions THREAD_INFO(%rsp,RIP) - "why RIP??" are now replaced by more logical THREAD_INFO(%rsp,SIZEOF_PTREGS) - "calculate thread_info's address using information that rsp is SIZEOF_PTREGS bytes below top of stack". While at it, replace "(off)-THREAD_SIZE(reg)" with equivalent "((off)-THREAD_SIZE)(reg)". The form without parentheses falsely looks like we invoke THREAD_SIZE() macro. Improve comment atop THREAD_INFO macro definition. This patch does not change generated code (verified by objdump). Signed-off-by: Denys Vlasenko Acked-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1426785469-15125-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 30 +++++++++++++++--------------- arch/x86/include/asm/thread_info.h | 8 +++++--- arch/x86/kernel/entry_64.S | 4 ++-- 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index ad9efef65a6b..50190e15c1b6 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -127,7 +127,7 @@ ENTRY(ia32_sysenter_target) CFI_REL_OFFSET rsp,0 pushfq_cfi /*CFI_REL_OFFSET rflags,0*/ - movl TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d + movl TI_sysenter_return+THREAD_INFO(%rsp,3*8),%r10d CFI_REGISTER rip,r10 pushq_cfi $__USER32_CS /*CFI_REL_OFFSET cs,0*/ @@ -159,8 +159,8 @@ ENTRY(ia32_sysenter_target) jnz sysenter_fix_flags sysenter_flags_fixed: - orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP) + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) CFI_REMEMBER_STATE jnz sysenter_tracesys cmpq $(IA32_NR_syscalls-1),%rax @@ -177,10 +177,10 @@ sysenter_dispatch: movq %rax,RAX(%rsp) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP) + testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) jnz sysexit_audit sysexit_from_sys_call: - andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP) + andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS) /* clear IF, that popfq doesn't enable interrupts early */ andl $~0x200,EFLAGS(%rsp) movl RIP(%rsp),%edx /* User %eip */ @@ -225,7 +225,7 @@ sysexit_from_sys_call: .endm .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP) + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) jnz ia32_ret_from_sys_call TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) @@ -240,7 +240,7 @@ sysexit_from_sys_call: movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl %edi,TI_flags+THREAD_INFO(%rsp,RIP) + testl %edi,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) jz \exit CLEAR_RREGS jmp int_with_check @@ -262,7 +262,7 @@ sysenter_fix_flags: sysenter_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) jz sysenter_auditsys #endif SAVE_EXTRA_REGS @@ -346,8 +346,8 @@ ENTRY(ia32_cstar_target) 1: movl (%r8),%r9d _ASM_EXTABLE(1b,ia32_badarg) 
ASM_CLAC - orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP) + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) CFI_REMEMBER_STATE jnz cstar_tracesys cmpq $IA32_NR_syscalls-1,%rax @@ -364,10 +364,10 @@ cstar_dispatch: movq %rax,RAX(%rsp) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP) + testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) jnz sysretl_audit sysretl_from_sys_call: - andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP) + andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS) RESTORE_RSI_RDI_RDX movl RIP(%rsp),%ecx CFI_REGISTER rip,rcx @@ -402,7 +402,7 @@ sysretl_audit: cstar_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) jz cstar_auditsys #endif xchgl %r9d,%ebp @@ -469,8 +469,8 @@ ENTRY(ia32_syscall) this could be a problem. */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS_EXCEPT_R891011 - orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP) + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) jnz ia32_tracesys cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 0abf7ab20ce2..ae9c2f13b476 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -207,10 +207,12 @@ static inline unsigned long current_stack_pointer(void) _ASM_SUB $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg ; /* - * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in - * a certain register (to be used in assembler memory operands). + * ASM operand which evaluates to thread_info address + * if it is known that "reg" is exactly "off" bytes below stack top. + * Example (fetch thread_info->fieldname): + * mov TI_fieldname+THREAD_INFO(reg, off),%eax */ -#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg) +#define THREAD_INFO(reg, off) ((off)-THREAD_SIZE)(reg) #endif diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 9184392a47b3..8076df982a18 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -258,7 +258,7 @@ GLOBAL(system_call_after_swapgs) SAVE_C_REGS_EXCEPT_RAX_RCX_R11 movq $-ENOSYS,RAX(%rsp) CFI_REL_OFFSET rip,RIP - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) jnz tracesys system_call_fastpath: #if __SYSCALL_MASK == ~0 @@ -276,7 +276,7 @@ system_call_fastpath: * Has incompletely filled pt_regs, iret frame is also incomplete. */ ret_from_sys_call: - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP) + testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) jnz int_ret_from_sys_call_fixup /* Go the the slow path */ LOCKDEP_SYS_EXIT From ef593260f0cae2699874f098fb5b19fb46502cb3 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 19 Mar 2015 18:17:46 +0100 Subject: [PATCH 088/148] x86/asm/entry: Get rid of KERNEL_STACK_OFFSET PER_CPU_VAR(kernel_stack) was set up in a way where it points five stack slots below the top of stack. 
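Concretely, the per-CPU setup changes roughly as follows (sketch distilled from the smpboot.c hunk further below; KERNEL_STACK_OFFSET is 5*8 on 64-bit):

	/* Before: leave 5 slots of headroom for a hand-built iret frame */
	per_cpu(kernel_stack, cpu) =
		(unsigned long)task_stack_page(idle) - KERNEL_STACK_OFFSET + THREAD_SIZE;

	/* After: point exactly at the top of the stack */
	per_cpu(kernel_stack, cpu) =
		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
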
Presumably, it was done to avoid one "sub $5*8,%rsp" in syscall/sysenter code paths, where iret frame needs to be created by hand. Ironically, none of them benefits from this optimization, since all of them need to allocate additional data on stack (struct pt_regs), so they still have to perform subtraction. This patch eliminates KERNEL_STACK_OFFSET. PER_CPU_VAR(kernel_stack) now points directly to top of stack. pt_regs allocations are adjusted to allocate iret frame as well. Hopefully we can merge it later with 32-bit specific PER_CPU_VAR(cpu_current_top_of_stack) variable... Net result in generated code is that constants in several insns are changed. This change is necessary for changing struct pt_regs creation in SYSCALL64 code path from MOV to PUSH instructions. Signed-off-by: Denys Vlasenko Acked-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1426785469-15125-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 4 ++-- arch/x86/include/asm/thread_info.h | 5 ++--- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/entry_64.S | 5 ++--- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 3 +-- arch/x86/kernel/smpboot.c | 3 +-- arch/x86/xen/smp.c | 3 +-- 8 files changed, 11 insertions(+), 16 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 50190e15c1b6..acbff3fb96a1 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -311,7 +311,7 @@ ENDPROC(ia32_sysenter_target) ENTRY(ia32_cstar_target) CFI_STARTPROC32 simple CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET + CFI_DEF_CFA rsp,0 CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ SWAPGS_UNSAFE_STACK @@ -323,7 +323,7 @@ ENTRY(ia32_cstar_target) * disabled irqs and here we enable it straight after entry: */ ENABLE_INTERRUPTS(CLBR_NONE) - ALLOC_PT_GPREGS_ON_STACK 8 /* +8: space for orig_ax */ + ALLOC_PT_GPREGS_ON_STACK 6*8 /* 6*8: space for orig_ax and iret frame */ SAVE_C_REGS_EXCEPT_RCX_R891011 movl %eax,%eax /* zero extension */ movq %rax,ORIG_RAX(%rsp) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ae9c2f13b476..ad0ee3423da5 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -172,7 +172,6 @@ struct thread_info { #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) #define STACK_WARN (THREAD_SIZE/8) -#define KERNEL_STACK_OFFSET (5*(BITS_PER_LONG/8)) /* * macros/functions for gaining access to the thread information structure @@ -201,10 +200,10 @@ static inline unsigned long current_stack_pointer(void) #else /* !__ASSEMBLY__ */ -/* how to get the thread information struct from ASM */ +/* Load thread_info address into "reg" */ #define GET_THREAD_INFO(reg) \ _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ - _ASM_SUB $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg ; + _ASM_SUB $(THREAD_SIZE),reg ; /* * ASM operand which evaluates to thread_info address diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 66d62aef7214..002216ab9145 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1116,7 +1116,7 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); DEFINE_PER_CPU(unsigned long, kernel_stack) = - (unsigned long)&init_thread_union - 
KERNEL_STACK_OFFSET + THREAD_SIZE; + (unsigned long)&init_thread_union + THREAD_SIZE; EXPORT_PER_CPU_SYMBOL(kernel_stack); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8076df982a18..eae69bba3a2c 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -225,7 +225,7 @@ ENDPROC(native_usergs_sysret64) ENTRY(system_call) CFI_STARTPROC simple CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET + CFI_DEF_CFA rsp,0 CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ SWAPGS_UNSAFE_STACK @@ -242,9 +242,8 @@ GLOBAL(system_call_after_swapgs) * so we can enable interrupts only after we're done with using rsp_scratch: */ movq %rsp,PER_CPU_VAR(rsp_scratch) - /* kernel_stack is set so that 5 slots (iret frame) are preallocated */ movq PER_CPU_VAR(kernel_stack),%rsp - ALLOC_PT_GPREGS_ON_STACK 8 /* +8: space for orig_ax */ + ALLOC_PT_GPREGS_ON_STACK 6*8 /* 6*8: space for orig_ax and iret frame */ movq %rcx,RIP(%rsp) movq PER_CPU_VAR(rsp_scratch),%rcx movq %r11,EFLAGS(%rsp) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index c5e987022ca0..8ed2106b06da 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -308,7 +308,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) load_sp0(tss, next); this_cpu_write(kernel_stack, (unsigned long)task_stack_page(next_p) + - THREAD_SIZE - KERNEL_STACK_OFFSET); + THREAD_SIZE); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index da8b74598d90..4baaa972f52a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -410,8 +410,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) load_sp0(tss, next); this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + - THREAD_SIZE - KERNEL_STACK_OFFSET); + (unsigned long)task_stack_page(next_p) + THREAD_SIZE); /* * Now maybe reload the debug registers and handle I/O bitmaps diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 759388c538cf..7b20ffd2fffc 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -813,8 +813,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) initial_gs = per_cpu_offset(cpu); #endif per_cpu(kernel_stack, cpu) = - (unsigned long)task_stack_page(idle) - - KERNEL_STACK_OFFSET + THREAD_SIZE; + (unsigned long)task_stack_page(idle) + THREAD_SIZE; early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; stack_start = idle->thread.sp; diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 08e8489c47f1..765b7684f858 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -452,8 +452,7 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) clear_tsk_thread_flag(idle, TIF_FORK); #endif per_cpu(kernel_stack, cpu) = - (unsigned long)task_stack_page(idle) - - KERNEL_STACK_OFFSET + THREAD_SIZE; + (unsigned long)task_stack_page(idle) + THREAD_SIZE; xen_setup_runstate_info(cpu); xen_setup_timer(cpu); From 9ed8e7d86061e7c3fb3855358d51ba4abb19ceb1 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 19 Mar 2015 18:17:47 +0100 Subject: [PATCH 089/148] x86/asm/entry/64: Use PUSH instructions to build pt_regs on stack With this change, on SYSCALL64 code path we are now populating pt_regs->cs, pt_regs->ss and pt_regs->rcx unconditionally and therefore don't need to do that in 
FIXUP_TOP_OF_STACK. We lose a number of large instructions there: text data bss dec hex filename 13298 0 0 13298 33f2 entry_64_before.o 12978 0 0 12978 32b2 entry_64.o What's more important, we convert two "MOVQ $imm,off(%rsp)" to "PUSH $imm" (the ones which fill pt_regs->cs,ss). Before this patch, placing them on fast path was slowing it down by two cycles: this form of MOV is very large, 12 bytes, and this probably reduces decode bandwidth to one instruction per cycle when CPU sees them. Therefore they were living in FIXUP_TOP_OF_STACK instead (away from fast path). "PUSH $imm" is a small 2-byte instruction. Moving it to fast path does not slow it down in my measurements. Signed-off-by: Denys Vlasenko Acked-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1426785469-15125-3-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 54 ++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 22 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index eae69bba3a2c..3ea4f6d8bc98 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -126,11 +126,8 @@ ENDPROC(native_usergs_sysret64) * manipulation. */ .macro FIXUP_TOP_OF_STACK tmp offset=0 - movq $__USER_DS,SS+\offset(%rsp) - movq $__USER_CS,CS+\offset(%rsp) - movq RIP+\offset(%rsp),\tmp /* get rip */ - movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */ - movq EFLAGS+\offset(%rsp),\tmp /* ditto for rflags->r11 */ + /* copy flags to r11 as sysret would do */ + movq EFLAGS+\offset(%rsp),\tmp movq \tmp,R11+\offset(%rsp) .endm @@ -214,7 +211,6 @@ ENDPROC(native_usergs_sysret64) * r9 arg5 * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) * - * Interrupts are off on entry. * Only called from user space. * * When user can change pt_regs->foo always force IRET. That is because @@ -228,6 +224,12 @@ ENTRY(system_call) CFI_DEF_CFA rsp,0 CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ + + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ SWAPGS_UNSAFE_STACK /* * A hypervisor implementation might want to use a label @@ -236,27 +238,35 @@ ENTRY(system_call) */ GLOBAL(system_call_after_swapgs) - /* - * We use 'rsp_scratch' as a scratch register, hence this block must execute - * atomically in the face of possible interrupt-driven task preemption, - * so we can enable interrupts only after we're done with using rsp_scratch: - */ movq %rsp,PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(kernel_stack),%rsp - ALLOC_PT_GPREGS_ON_STACK 6*8 /* 6*8: space for orig_ax and iret frame */ - movq %rcx,RIP(%rsp) - movq PER_CPU_VAR(rsp_scratch),%rcx - movq %r11,EFLAGS(%rsp) - movq %rcx,RSP(%rsp) + + /* Construct struct pt_regs on stack */ + pushq_cfi $__USER_DS /* pt_regs->ss */ + pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ /* - * No need to follow this irqs off/on section - it's straight - * and short: + * Re-enable interrupts. + * We use 'rsp_scratch' as a scratch space, hence irq-off block above + * must execute atomically in the face of possible interrupt-driven + * task preemption. 
We must enable interrupts only after we're done + * with using rsp_scratch: */ ENABLE_INTERRUPTS(CLBR_NONE) - movq_cfi rax,ORIG_RAX - SAVE_C_REGS_EXCEPT_RAX_RCX_R11 - movq $-ENOSYS,RAX(%rsp) - CFI_REL_OFFSET rip,RIP + pushq_cfi %r11 /* pt_regs->flags */ + pushq_cfi $__USER_CS /* pt_regs->cs */ + pushq_cfi %rcx /* pt_regs->ip */ + CFI_REL_OFFSET rip,0 + pushq_cfi_reg rax /* pt_regs->orig_ax */ + pushq_cfi_reg rdi /* pt_regs->di */ + pushq_cfi_reg rsi /* pt_regs->si */ + pushq_cfi_reg rdx /* pt_regs->dx */ + pushq_cfi_reg rcx /* pt_regs->cx */ + pushq_cfi $-ENOSYS /* pt_regs->ax */ + pushq_cfi_reg r8 /* pt_regs->r8 */ + pushq_cfi_reg r9 /* pt_regs->r9 */ + pushq_cfi_reg r10 /* pt_regs->r10 */ + sub $(7*8),%rsp /* pt_regs->r11,bp,bx,r12-15 not saved */ + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) jnz tracesys system_call_fastpath: From a71ffdd780760dc62c3d4cffb98eaaedaf5068b8 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 19 Mar 2015 18:17:48 +0100 Subject: [PATCH 090/148] x86/asm/entry/64: Get rid of the FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK macros The FIXUP_TOP_OF_STACK macro is only necessary because we don't save %r11 to pt_regs->r11 on SYSCALL64 fast path, but we want ptrace to see it populated. Bite the bullet, add a single additional PUSH instruction, and remove the FIXUP_TOP_OF_STACK macro. The RESTORE_TOP_OF_STACK macro is already a nop. Remove it too. On SandyBridge CPU, it does not get slower: measured 54.22 ns per getpid syscall before and after last two changes on defconfig kernel. Signed-off-by: Denys Vlasenko Acked-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1426785469-15125-4-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 35 ++--------------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3ea4f6d8bc98..3f8daba28eb0 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -22,8 +22,6 @@ * - CFI macros are used to generate dwarf2 unwind information for better * backtraces. They don't change any code. * - ENTRY/END Define functions in the symbol table. - * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack - * frame that is otherwise undefined after a SYSCALL * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. * - idtentry - Define exception entry points. */ @@ -118,23 +116,6 @@ ENDPROC(native_usergs_sysret64) # define TRACE_IRQS_IRETQ_DEBUG TRACE_IRQS_IRETQ #endif -/* - * C code is not supposed to know that the iret frame is not populated. - * Every time a C function with an pt_regs argument is called from - * the SYSCALL based fast path FIXUP_TOP_OF_STACK is needed. - * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs - * manipulation. 
- */ - .macro FIXUP_TOP_OF_STACK tmp offset=0 - /* copy flags to r11 as sysret would do */ - movq EFLAGS+\offset(%rsp),\tmp - movq \tmp,R11+\offset(%rsp) - .endm - - .macro RESTORE_TOP_OF_STACK tmp offset=0 - /* nothing to do */ - .endm - /* * empty frame */ @@ -265,7 +246,8 @@ GLOBAL(system_call_after_swapgs) pushq_cfi_reg r8 /* pt_regs->r8 */ pushq_cfi_reg r9 /* pt_regs->r9 */ pushq_cfi_reg r10 /* pt_regs->r10 */ - sub $(7*8),%rsp /* pt_regs->r11,bp,bx,r12-15 not saved */ + pushq_cfi_reg r11 /* pt_regs->r11 */ + sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) jnz tracesys @@ -312,7 +294,6 @@ ret_from_sys_call: CFI_RESTORE_STATE int_ret_from_sys_call_fixup: - FIXUP_TOP_OF_STACK %r11 jmp int_ret_from_sys_call /* Do syscall entry tracing */ @@ -328,7 +309,6 @@ tracesys: tracesys_phase2: SAVE_EXTRA_REGS - FIXUP_TOP_OF_STACK %rdi movq %rsp, %rdi movq $AUDIT_ARCH_X86_64, %rsi movq %rax,%rdx @@ -421,9 +401,7 @@ ENTRY(stub_\func) CFI_STARTPROC DEFAULT_FRAME 0, 8 /* offset 8: return address */ SAVE_EXTRA_REGS 8 - FIXUP_TOP_OF_STACK %r11, 8 call sys_\func - RESTORE_TOP_OF_STACK %r11, 8 ret CFI_ENDPROC END(stub_\func) @@ -438,7 +416,6 @@ ENTRY(stub_execve) addq $8, %rsp DEFAULT_FRAME 0 SAVE_EXTRA_REGS - FIXUP_TOP_OF_STACK %r11 call sys_execve movq %rax,RAX(%rsp) RESTORE_EXTRA_REGS @@ -451,9 +428,7 @@ ENTRY(stub_execveat) addq $8, %rsp DEFAULT_FRAME 0 SAVE_EXTRA_REGS - FIXUP_TOP_OF_STACK %r11 call sys_execveat - RESTORE_TOP_OF_STACK %r11 movq %rax,RAX(%rsp) RESTORE_EXTRA_REGS jmp int_ret_from_sys_call @@ -469,7 +444,6 @@ ENTRY(stub_rt_sigreturn) addq $8, %rsp DEFAULT_FRAME 0 SAVE_EXTRA_REGS - FIXUP_TOP_OF_STACK %r11 call sys_rt_sigreturn movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer RESTORE_EXTRA_REGS @@ -483,7 +457,6 @@ ENTRY(stub_x32_rt_sigreturn) addq $8, %rsp DEFAULT_FRAME 0 SAVE_EXTRA_REGS - FIXUP_TOP_OF_STACK %r11 call sys32_x32_rt_sigreturn movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer RESTORE_EXTRA_REGS @@ -496,9 +469,7 @@ ENTRY(stub_x32_execve) addq $8, %rsp DEFAULT_FRAME 0 SAVE_EXTRA_REGS - FIXUP_TOP_OF_STACK %r11 call compat_sys_execve - RESTORE_TOP_OF_STACK %r11 movq %rax,RAX(%rsp) RESTORE_EXTRA_REGS jmp int_ret_from_sys_call @@ -510,9 +481,7 @@ ENTRY(stub_x32_execveat) addq $8, %rsp DEFAULT_FRAME 0 SAVE_EXTRA_REGS - FIXUP_TOP_OF_STACK %r11 call compat_sys_execveat - RESTORE_TOP_OF_STACK %r11 movq %rax,RAX(%rsp) RESTORE_EXTRA_REGS jmp int_ret_from_sys_call From 65c2377486c0b68f149f7d8770499a86b15786b6 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 19 Mar 2015 18:17:49 +0100 Subject: [PATCH 091/148] x86/asm/entry/64: Get rid of int_ret_from_sys_call_fixup With the FIXUP_TOP_OF_STACK macro removed, this intermediate jump is unnecessary. Signed-off-by: Denys Vlasenko Acked-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1426785469-15125-5-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3f8daba28eb0..df04ee069b1f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -268,7 +268,7 @@ system_call_fastpath: */ ret_from_sys_call: testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) - jnz int_ret_from_sys_call_fixup /* Go the the slow path */ + jnz int_ret_from_sys_call /* Go the slow path */ LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_NONE) @@ -293,9 +293,6 @@ ret_from_sys_call: CFI_RESTORE_STATE -int_ret_from_sys_call_fixup: - jmp int_ret_from_sys_call - /* Do syscall entry tracing */ tracesys: movq %rsp, %rdi From 84f53788458c95309b88948b69ff95921e9c74a8 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sun, 22 Mar 2015 22:01:12 +0100 Subject: [PATCH 092/148] x86/asm: Deobfuscate segment.h This file just defines a number of constants, and a few macros and inline functions. It is particularly badly written. For example, it is not trivial to see how descriptors are numbered (you'd expect that should be easy, right?). This change deobfuscates it via the following changes: Group all GDT_ENTRY_foo together (move intervening stuff away). Number them explicitly: use a number, not PREV_DEFINE+1, +2, +3: I want to immediately see that GDT_ENTRY_PNPBIOS_CS32 is 18. Seeing (GDT_ENTRY_KERNEL_BASE+6) instead is not useful. The above change allows to remove GDT_ENTRY_KERNEL_BASE and GDT_ENTRY_PNPBIOS_BASE, which weren't used anywhere else. After a group of GDT_ENTRY_foo, define all selector values. Remove or improve some comments. In particular: Comment deleted as stating the obvious: /* * The GDT has 32 entries */ #define GDT_ENTRIES 32 "The segment offset needs to contain a RPL. Grr. -AK" changed to "Selectors need to also have a correct RPL (+3 thingy)" "GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets)" expanded into a description *how exactly* sysret hardcodes them. Patch was tested to compile and not change vmlinux.o on 32-bit and 64-bit builds (verified with objdump). Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Signed-off-by: Ingo Molnar --- arch/x86/include/asm/segment.h | 125 ++++++++++++++------------------- 1 file changed, 54 insertions(+), 71 deletions(-) diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index db257a58571f..117028f58882 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -15,12 +15,10 @@ /* Simple and small GDT entries for booting only */ #define GDT_ENTRY_BOOT_CS 2 +#define GDT_ENTRY_BOOT_DS 3 +#define GDT_ENTRY_BOOT_TSS 4 #define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8) - -#define GDT_ENTRY_BOOT_DS (GDT_ENTRY_BOOT_CS + 1) #define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8) - -#define GDT_ENTRY_BOOT_TSS (GDT_ENTRY_BOOT_CS + 2) #define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8) #define SEGMENT_RPL_MASK 0x3 /* @@ -80,97 +78,86 @@ #define GDT_ENTRY_TLS_MIN 6 #define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) +#define GDT_ENTRY_KERNEL_CS 12 +#define GDT_ENTRY_KERNEL_DS 13 #define GDT_ENTRY_DEFAULT_USER_CS 14 - #define GDT_ENTRY_DEFAULT_USER_DS 15 +#define GDT_ENTRY_TSS 16 +#define GDT_ENTRY_LDT 17 +#define GDT_ENTRY_PNPBIOS_CS32 18 +#define GDT_ENTRY_PNPBIOS_CS16 19 +#define GDT_ENTRY_PNPBIOS_DS 20 +#define GDT_ENTRY_PNPBIOS_TS1 21 +#define GDT_ENTRY_PNPBIOS_TS2 22 +#define GDT_ENTRY_APMBIOS_BASE 23 -#define GDT_ENTRY_KERNEL_BASE (12) +#define GDT_ENTRY_ESPFIX_SS 26 +#define GDT_ENTRY_PERCPU 27 +#define GDT_ENTRY_STACK_CANARY 28 -#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE+0) +#define GDT_ENTRY_DOUBLEFAULT_TSS 31 -#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE+1) - -#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE+4) -#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE+5) - -#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE+6) -#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE+11) - -#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE+14) +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3) +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3) #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) - -#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE+15) +#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */ +#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */ +/* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" 
*/ +#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == PNP_CS32) +#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */ +#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */ +#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */ #ifdef CONFIG_SMP -#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) +#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU*8) #else -#define __KERNEL_PERCPU 0 +#define __KERNEL_PERCPU 0 #endif - -#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE+16) #ifdef CONFIG_CC_STACKPROTECTOR #define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) #else #define __KERNEL_STACK_CANARY 0 #endif -#define GDT_ENTRY_DOUBLEFAULT_TSS 31 - -/* - * The GDT has 32 entries - */ #define GDT_ENTRIES 32 -/* The PnP BIOS entries in the GDT */ -#define GDT_ENTRY_PNPBIOS_CS32 (GDT_ENTRY_PNPBIOS_BASE + 0) -#define GDT_ENTRY_PNPBIOS_CS16 (GDT_ENTRY_PNPBIOS_BASE + 1) -#define GDT_ENTRY_PNPBIOS_DS (GDT_ENTRY_PNPBIOS_BASE + 2) -#define GDT_ENTRY_PNPBIOS_TS1 (GDT_ENTRY_PNPBIOS_BASE + 3) -#define GDT_ENTRY_PNPBIOS_TS2 (GDT_ENTRY_PNPBIOS_BASE + 4) +#else /* 64-bit: */ -/* The PnP BIOS selectors */ -#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */ -#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */ -#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */ -#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */ -#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */ - - -/* - * Matching rules for certain types of segments. - */ - -/* Matches PNP_CS32 and PNP_CS16 (they must be consecutive) */ -#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == GDT_ENTRY_PNPBIOS_BASE * 8) - - -#else #include -#define GDT_ENTRY_KERNEL32_CS 1 -#define GDT_ENTRY_KERNEL_CS 2 -#define GDT_ENTRY_KERNEL_DS 3 - -#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS * 8) - +#define GDT_ENTRY_KERNEL32_CS 1 +#define GDT_ENTRY_KERNEL_CS 2 +#define GDT_ENTRY_KERNEL_DS 3 /* * we cannot use the same code segment descriptor for user and kernel * -- not even in the long flat mode, because of different DPL /kkeil - * The segment offset needs to contain a RPL. Grr. -AK - * GDT layout to get 64bit syscall right (sysret hardcodes gdt offsets) + * GDT layout to get 64bit syscall/sysret right. 
sysret hardcodes selectors: + * if returning to 32-bit userspace: cs = STAR.SYSRET_CS, + * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16, + * ss = STAR.SYSRET_CS+8 (in either case) + * thus USER_DS should be between 32-bit and 64-bit code selectors: */ #define GDT_ENTRY_DEFAULT_USER32_CS 4 #define GDT_ENTRY_DEFAULT_USER_DS 5 #define GDT_ENTRY_DEFAULT_USER_CS 6 -#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8+3) + +#define GDT_ENTRY_TSS 8 /* needs two entries */ +#define GDT_ENTRY_LDT 10 /* needs two entries */ +#define GDT_ENTRY_TLS_MIN 12 +#define GDT_ENTRY_TLS_MAX 14 + +#define GDT_ENTRY_PER_CPU 15 /* abused to load per CPU data from limit */ + +/* Selectors need to also have a correct RPL (+3 thingy) */ +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3) +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3) +#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS*8) +#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8+3) #define __USER32_DS __USER_DS - -#define GDT_ENTRY_TSS 8 /* needs two entries */ -#define GDT_ENTRY_LDT 10 /* needs two entries */ -#define GDT_ENTRY_TLS_MIN 12 -#define GDT_ENTRY_TLS_MAX 14 - -#define GDT_ENTRY_PER_CPU 15 /* Abused to load per CPU data from limit */ -#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU * 8 + 3) +#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8+3) /* TLS indexes for 64bit - hardcoded in arch_prctl */ #define FS_TLS 0 @@ -183,10 +170,6 @@ #endif -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3) -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3) #ifndef CONFIG_PARAVIRT #define get_kernel_rpl() 0 #endif From d56fe4bf5f3cc30d455c80520f7b71da36ae00e6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 24 Mar 2015 14:41:37 +0100 Subject: [PATCH 093/148] x86/asm/entry/64: Always set up SYSENTER MSRs On CONFIG_IA32_EMULATION=y kernels we set up MSR_IA32_SYSENTER_CS/ESP/EIP, but on !CONFIG_IA32_EMULATION kernels we leave them unchanged. Clear them to make sure the instruction is disabled properly. SYSCALL is set up properly in both cases. Acked-by: Denys Vlasenko Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 002216ab9145..c928a7ae1099 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1169,9 +1169,8 @@ void syscall_init(void) */ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); wrmsrl(MSR_LSTAR, system_call); -#ifndef CONFIG_IA32_EMULATION - wrmsrl(MSR_CSTAR, ignore_sysret); -#else + +#ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, ia32_cstar_target); /* * Always load these, in case some future 64-bit CPU supports @@ -1180,6 +1179,11 @@ void syscall_init(void) wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); +#else + wrmsrl(MSR_CSTAR, ignore_sysret); + wrmsrl_safe(MSR_IA32_SYSENTER_CS, 0); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); #endif /* Flags to clear on syscall */ From 1ddc6f3c60d75a7577dd33bc441e309febe2fc76 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 24 Mar 2015 19:43:11 +0100 Subject: [PATCH 094/148] x86/asm/entry/64: Improve the THREAD_INFO() macro explanation Explain the background, and add a real example. Acked-by: Denys Vlasenko Acked-by: Andy Lutomirski Acked-by: Borislav Petkov Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/20150324184311.GA14760@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/thread_info.h | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index ad0ee3423da5..813dfbb867a7 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -206,10 +206,29 @@ static inline unsigned long current_stack_pointer(void) _ASM_SUB $(THREAD_SIZE),reg ; /* - * ASM operand which evaluates to thread_info address - * if it is known that "reg" is exactly "off" bytes below stack top. - * Example (fetch thread_info->fieldname): - * mov TI_fieldname+THREAD_INFO(reg, off),%eax + * ASM operand which evaluates to a 'thread_info' address of + * the current task, if it is known that "reg" is exactly "off" + * bytes below the top of the stack currently. + * + * ( The kernel stack's size is known at build time, it is usually + * 2 or 4 pages, and the bottom of the kernel stack contains + * the thread_info structure. So to access the thread_info very + * quickly from assembly code we can calculate down from the + * top of the kernel stack to the bottom, using constant, + * build-time calculations only. ) + * + * For example, to fetch the current thread_info->flags value into %eax + * on x86-64 defconfig kernels, in syscall entry code where RSP is + * currently at exactly SIZEOF_PTREGS bytes away from the top of the + * stack: + * + * mov TI_flags+THREAD_INFO(%rsp, SIZEOF_PTREGS), %eax + * + * will translate to: + * + * 8b 84 24 b8 c0 ff ff mov -0x3f48(%rsp), %eax + * + * which is below the current RSP by almost 16K. 
*/ #define THREAD_INFO(reg, off) ((off)-THREAD_SIZE)(reg) From f9d71854b4fe9b22ca199c4676da5a6ece1e5c17 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 24 Mar 2015 19:44:11 +0100 Subject: [PATCH 095/148] x86/asm/entry/64: Merge the field offset into the THREAD_INFO() macro Before: TI_sysenter_return+THREAD_INFO(%rsp,3*8),%r10d After: movl THREAD_INFO(TI_sysenter_return, %rsp, 3*8), %r10d to turn it into a clear thread_info accessor. No code changed: md5: fb4cb2b3ce05d89940ca304efc8ff183 ia32entry.o.before.asm fb4cb2b3ce05d89940ca304efc8ff183 ia32entry.o.after.asm e39f2958a5d1300158e276e4f7663263 entry_64.o.before.asm e39f2958a5d1300158e276e4f7663263 entry_64.o.after.asm Acked-by: Andy Lutomirski Acked-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/20150324184411.GB14760@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 30 +++++++++++++++--------------- arch/x86/include/asm/thread_info.h | 4 ++-- arch/x86/kernel/entry_64.S | 4 ++-- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index acbff3fb96a1..32e94aec6073 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -127,7 +127,7 @@ ENTRY(ia32_sysenter_target) CFI_REL_OFFSET rsp,0 pushfq_cfi /*CFI_REL_OFFSET rflags,0*/ - movl TI_sysenter_return+THREAD_INFO(%rsp,3*8),%r10d + movl THREAD_INFO(TI_sysenter_return, %rsp, 3*8), %r10d CFI_REGISTER rip,r10 pushq_cfi $__USER32_CS /*CFI_REL_OFFSET cs,0*/ @@ -159,8 +159,8 @@ ENTRY(ia32_sysenter_target) jnz sysenter_fix_flags sysenter_flags_fixed: - orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) + orl $TS_COMPAT, THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) CFI_REMEMBER_STATE jnz sysenter_tracesys cmpq $(IA32_NR_syscalls-1),%rax @@ -177,10 +177,10 @@ sysenter_dispatch: movq %rax,RAX(%rsp) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) + testl $_TIF_ALLWORK_MASK, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz sysexit_audit sysexit_from_sys_call: - andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS) + andl $~TS_COMPAT,THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) /* clear IF, that popfq doesn't enable interrupts early */ andl $~0x200,EFLAGS(%rsp) movl RIP(%rsp),%edx /* User %eip */ @@ -225,7 +225,7 @@ sysexit_from_sys_call: .endm .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz ia32_ret_from_sys_call TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) @@ -240,7 +240,7 @@ sysexit_from_sys_call: movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl %edi,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) + testl %edi, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jz \exit CLEAR_RREGS jmp int_with_check @@ -262,7 +262,7 @@ sysenter_fix_flags: sysenter_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), THREAD_INFO(TI_flags, 
%rsp, SIZEOF_PTREGS) jz sysenter_auditsys #endif SAVE_EXTRA_REGS @@ -346,8 +346,8 @@ ENTRY(ia32_cstar_target) 1: movl (%r8),%r9d _ASM_EXTABLE(1b,ia32_badarg) ASM_CLAC - orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) + orl $TS_COMPAT, THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) CFI_REMEMBER_STATE jnz cstar_tracesys cmpq $IA32_NR_syscalls-1,%rax @@ -364,10 +364,10 @@ cstar_dispatch: movq %rax,RAX(%rsp) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) + testl $_TIF_ALLWORK_MASK, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz sysretl_audit sysretl_from_sys_call: - andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS) + andl $~TS_COMPAT, THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) RESTORE_RSI_RDI_RDX movl RIP(%rsp),%ecx CFI_REGISTER rip,rcx @@ -402,7 +402,7 @@ sysretl_audit: cstar_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jz cstar_auditsys #endif xchgl %r9d,%ebp @@ -469,8 +469,8 @@ ENTRY(ia32_syscall) this could be a problem. */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS_EXCEPT_R891011 - orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) + orl $TS_COMPAT, THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz ia32_tracesys cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 813dfbb867a7..224285b674ca 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -222,7 +222,7 @@ static inline unsigned long current_stack_pointer(void) * currently at exactly SIZEOF_PTREGS bytes away from the top of the * stack: * - * mov TI_flags+THREAD_INFO(%rsp, SIZEOF_PTREGS), %eax + * mov THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax * * will translate to: * @@ -230,7 +230,7 @@ static inline unsigned long current_stack_pointer(void) * * which is below the current RSP by almost 16K. */ -#define THREAD_INFO(reg, off) ((off)-THREAD_SIZE)(reg) +#define THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg) #endif diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index df04ee069b1f..8f01a4f1cf9e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -249,7 +249,7 @@ GLOBAL(system_call_after_swapgs) pushq_cfi_reg r11 /* pt_regs->r11 */ sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz tracesys system_call_fastpath: #if __SYSCALL_MASK == ~0 @@ -267,7 +267,7 @@ system_call_fastpath: * Has incompletely filled pt_regs, iret frame is also incomplete. 
*/ ret_from_sys_call: - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,SIZEOF_PTREGS) + testl $_TIF_ALLWORK_MASK, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz int_ret_from_sys_call /* Go the slow path */ LOCKDEP_SYS_EXIT From dca5b52ad76b10c3adc29e2a006d4b1721c44a8d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 24 Mar 2015 19:44:42 +0100 Subject: [PATCH 096/148] x86/asm/entry/64: Rename THREAD_INFO() to ASM_THREAD_INFO() The THREAD_INFO() macro has a somewhat confusingly generic name, defined in a generic .h C header file. It also does not make it clear that it constructs a memory operand for use in assembly code. Rename it to ASM_THREAD_INFO() to make it all glaringly obvious on first glance. Acked-by: Borislav Petkov Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/20150324184442.GC14760@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 30 +++++++++++++++--------------- arch/x86/include/asm/thread_info.h | 4 ++-- arch/x86/kernel/entry_64.S | 4 ++-- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 32e94aec6073..5d2641ce9957 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -127,7 +127,7 @@ ENTRY(ia32_sysenter_target) CFI_REL_OFFSET rsp,0 pushfq_cfi /*CFI_REL_OFFSET rflags,0*/ - movl THREAD_INFO(TI_sysenter_return, %rsp, 3*8), %r10d + movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 3*8), %r10d CFI_REGISTER rip,r10 pushq_cfi $__USER32_CS /*CFI_REL_OFFSET cs,0*/ @@ -159,8 +159,8 @@ ENTRY(ia32_sysenter_target) jnz sysenter_fix_flags sysenter_flags_fixed: - orl $TS_COMPAT, THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) CFI_REMEMBER_STATE jnz sysenter_tracesys cmpq $(IA32_NR_syscalls-1),%rax @@ -177,10 +177,10 @@ sysenter_dispatch: movq %rax,RAX(%rsp) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz sysexit_audit sysexit_from_sys_call: - andl $~TS_COMPAT,THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) /* clear IF, that popfq doesn't enable interrupts early */ andl $~0x200,EFLAGS(%rsp) movl RIP(%rsp),%edx /* User %eip */ @@ -225,7 +225,7 @@ sysexit_from_sys_call: .endm .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz ia32_ret_from_sys_call TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) @@ -240,7 +240,7 @@ sysexit_from_sys_call: movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl %edi, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jz \exit CLEAR_RREGS jmp int_with_check @@ -262,7 +262,7 @@ sysenter_fix_flags: sysenter_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + 
testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jz sysenter_auditsys #endif SAVE_EXTRA_REGS @@ -346,8 +346,8 @@ ENTRY(ia32_cstar_target) 1: movl (%r8),%r9d _ASM_EXTABLE(1b,ia32_badarg) ASM_CLAC - orl $TS_COMPAT, THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) CFI_REMEMBER_STATE jnz cstar_tracesys cmpq $IA32_NR_syscalls-1,%rax @@ -364,10 +364,10 @@ cstar_dispatch: movq %rax,RAX(%rsp) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz sysretl_audit sysretl_from_sys_call: - andl $~TS_COMPAT, THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) RESTORE_RSI_RDI_RDX movl RIP(%rsp),%ecx CFI_REGISTER rip,rcx @@ -402,7 +402,7 @@ sysretl_audit: cstar_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jz cstar_auditsys #endif xchgl %r9d,%ebp @@ -469,8 +469,8 @@ ENTRY(ia32_syscall) this could be a problem. */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS_EXCEPT_R891011 - orl $TS_COMPAT, THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz ia32_tracesys cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 224285b674ca..ea2dbe82cba3 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -222,7 +222,7 @@ static inline unsigned long current_stack_pointer(void) * currently at exactly SIZEOF_PTREGS bytes away from the top of the * stack: * - * mov THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax + * mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax * * will translate to: * @@ -230,7 +230,7 @@ static inline unsigned long current_stack_pointer(void) * * which is below the current RSP by almost 16K. */ -#define THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg) +#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg) #endif diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 8f01a4f1cf9e..daf5d94c0e78 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -249,7 +249,7 @@ GLOBAL(system_call_after_swapgs) pushq_cfi_reg r11 /* pt_regs->r11 */ sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ - testl $_TIF_WORK_SYSCALL_ENTRY, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz tracesys system_call_fastpath: #if __SYSCALL_MASK == ~0 @@ -267,7 +267,7 @@ system_call_fastpath: * Has incompletely filled pt_regs, iret frame is also incomplete. 
*/ ret_from_sys_call: - testl $_TIF_ALLWORK_MASK, THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz int_ret_from_sys_call /* Go the slow path */ LOCKDEP_SYS_EXIT From 72d64cc76941cde45e65e2a5b9fb81d527963645 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 24 Mar 2015 20:45:42 +0100 Subject: [PATCH 097/148] x86/asm: Further improve segment.h readability - extend/clarify explanations where necessary - move comments from macro values to before the macro, to make them more consistent, and to reduce preprocessor overhead - sort GDT index and selector values likewise by number - use consistent, modern kernel coding style across the file - capitalize consistently - use consistent vertical spacing - remove the unused get_limit() method (noticed by Andy Lutomirski) No change in code (verified with objdump -d): 64-bit defconfig+kvmconfig: 815a129bc1f80de6445c1d8ca5b97cad vmlinux.o.before.asm 815a129bc1f80de6445c1d8ca5b97cad vmlinux.o.after.asm 32-bit defconfig+kvmconfig: e659ef045159ddf41a0771b33a34aae5 vmlinux.o.before.asm e659ef045159ddf41a0771b33a34aae5 vmlinux.o.after.asm Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Signed-off-by: Ingo Molnar --- arch/x86/include/asm/segment.h | 244 +++++++++++++++++++-------------- 1 file changed, 142 insertions(+), 102 deletions(-) diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 117028f58882..d394899e055c 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -3,8 +3,10 @@ #include -/* Constructor for a conventional segment GDT (or LDT) entry */ -/* This is a macro so it can be used in initializers */ +/* + * Constructor for a conventional segment GDT (or LDT) entry. + * This is a macro so it can be used in initializers. + */ #define GDT_ENTRY(flags, base, limit) \ ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \ (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \ @@ -12,71 +14,78 @@ (((base) & _AC(0x00ffffff,ULL)) << 16) | \ (((limit) & _AC(0x0000ffff,ULL)))) -/* Simple and small GDT entries for booting only */ +/* Simple and small GDT entries for booting only: */ #define GDT_ENTRY_BOOT_CS 2 #define GDT_ENTRY_BOOT_DS 3 #define GDT_ENTRY_BOOT_TSS 4 -#define __BOOT_CS (GDT_ENTRY_BOOT_CS * 8) -#define __BOOT_DS (GDT_ENTRY_BOOT_DS * 8) -#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS * 8) +#define __BOOT_CS (GDT_ENTRY_BOOT_CS*8) +#define __BOOT_DS (GDT_ENTRY_BOOT_DS*8) +#define __BOOT_TSS (GDT_ENTRY_BOOT_TSS*8) -#define SEGMENT_RPL_MASK 0x3 /* - * Bottom two bits of selector give the ring - * privilege level - */ -#define SEGMENT_TI_MASK 0x4 /* Bit 2 is table indicator (LDT/GDT) */ -#define USER_RPL 0x3 /* User mode is privilege level 3 */ -#define SEGMENT_LDT 0x4 /* LDT segment has TI set... */ -#define SEGMENT_GDT 0x0 /* ... GDT has it cleared */ +/* + * Bottom two bits of selector give the ring + * privilege level + */ +#define SEGMENT_RPL_MASK 0x3 + +/* User mode is privilege level 3: */ +#define USER_RPL 0x3 + +/* Bit 2 is Table Indicator (TI): selects between LDT or GDT */ +#define SEGMENT_TI_MASK 0x4 +/* LDT segment has TI set ... */ +#define SEGMENT_LDT 0x4 +/* ... 
GDT has it cleared */ +#define SEGMENT_GDT 0x0 #ifdef CONFIG_X86_32 /* * The layout of the per-CPU GDT under Linux: * - * 0 - null + * 0 - null <=== cacheline #1 * 1 - reserved * 2 - reserved * 3 - reserved * - * 4 - unused <==== new cacheline + * 4 - unused <=== cacheline #2 * 5 - unused * * ------- start of TLS (Thread-Local Storage) segments: * * 6 - TLS segment #1 [ glibc's TLS segment ] * 7 - TLS segment #2 [ Wine's %fs Win32 segment ] - * 8 - TLS segment #3 + * 8 - TLS segment #3 <=== cacheline #3 * 9 - reserved * 10 - reserved * 11 - reserved * * ------- start of kernel segments: * - * 12 - kernel code segment <==== new cacheline + * 12 - kernel code segment <=== cacheline #4 * 13 - kernel data segment * 14 - default user CS * 15 - default user DS - * 16 - TSS + * 16 - TSS <=== cacheline #5 * 17 - LDT * 18 - PNPBIOS support (16->32 gate) * 19 - PNPBIOS support - * 20 - PNPBIOS support + * 20 - PNPBIOS support <=== cacheline #6 * 21 - PNPBIOS support * 22 - PNPBIOS support * 23 - APM BIOS support - * 24 - APM BIOS support + * 24 - APM BIOS support <=== cacheline #7 * 25 - APM BIOS support * * 26 - ESPFIX small SS * 27 - per-cpu [ offset to per-cpu data area ] - * 28 - stack_canary-20 [ for stack protector ] + * 28 - stack_canary-20 [ for stack protector ] <=== cacheline #8 * 29 - unused * 30 - unused * 31 - TSS for double fault handler */ -#define GDT_ENTRY_TLS_MIN 6 -#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) +#define GDT_ENTRY_TLS_MIN 6 +#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) #define GDT_ENTRY_KERNEL_CS 12 #define GDT_ENTRY_KERNEL_DS 13 @@ -97,96 +106,134 @@ #define GDT_ENTRY_DOUBLEFAULT_TSS 31 +/* + * Number of entries in the GDT table: + */ +#define GDT_ENTRIES 32 + +/* + * Segment selector values corresponding to the above entries: + */ + #define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) #define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3) -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3) +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8) -#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32 * 8) /* segment for calling fn */ -#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16 * 8) /* code segment for BIOS */ + +/* segment for calling fn: */ +#define PNP_CS32 (GDT_ENTRY_PNPBIOS_CS32*8) +/* code segment for BIOS: */ +#define PNP_CS16 (GDT_ENTRY_PNPBIOS_CS16*8) + /* "Is this PNP code selector (PNP_CS32 or PNP_CS16)?" 
*/ -#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == PNP_CS32) -#define PNP_DS (GDT_ENTRY_PNPBIOS_DS * 8) /* data segment for BIOS */ -#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1 * 8) /* transfer data segment */ -#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2 * 8) /* another data segment */ +#define SEGMENT_IS_PNP_CODE(x) (((x) & 0xf4) == PNP_CS32) + +/* data segment for BIOS: */ +#define PNP_DS (GDT_ENTRY_PNPBIOS_DS*8) +/* transfer data segment: */ +#define PNP_TS1 (GDT_ENTRY_PNPBIOS_TS1*8) +/* another data segment: */ +#define PNP_TS2 (GDT_ENTRY_PNPBIOS_TS2*8) + #ifdef CONFIG_SMP -#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU*8) +# define __KERNEL_PERCPU (GDT_ENTRY_PERCPU*8) #else -#define __KERNEL_PERCPU 0 -#endif -#ifdef CONFIG_CC_STACKPROTECTOR -#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) -#else -#define __KERNEL_STACK_CANARY 0 +# define __KERNEL_PERCPU 0 #endif -#define GDT_ENTRIES 32 +#ifdef CONFIG_CC_STACKPROTECTOR +# define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8) +#else +# define __KERNEL_STACK_CANARY 0 +#endif #else /* 64-bit: */ #include -#define GDT_ENTRY_KERNEL32_CS 1 -#define GDT_ENTRY_KERNEL_CS 2 -#define GDT_ENTRY_KERNEL_DS 3 +#define GDT_ENTRY_KERNEL32_CS 1 +#define GDT_ENTRY_KERNEL_CS 2 +#define GDT_ENTRY_KERNEL_DS 3 + /* - * we cannot use the same code segment descriptor for user and kernel - * -- not even in the long flat mode, because of different DPL /kkeil - * GDT layout to get 64bit syscall/sysret right. sysret hardcodes selectors: - * if returning to 32-bit userspace: cs = STAR.SYSRET_CS, - * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16, + * We cannot use the same code segment descriptor for user and kernel mode, + * not even in long flat mode, because of different DPL. + * + * GDT layout to get 64-bit SYSCALL/SYSRET support right. 
SYSRET hardcodes + * selectors: + * + * if returning to 32-bit userspace: cs = STAR.SYSRET_CS, + * if returning to 64-bit userspace: cs = STAR.SYSRET_CS+16, + * * ss = STAR.SYSRET_CS+8 (in either case) + * * thus USER_DS should be between 32-bit and 64-bit code selectors: */ -#define GDT_ENTRY_DEFAULT_USER32_CS 4 -#define GDT_ENTRY_DEFAULT_USER_DS 5 -#define GDT_ENTRY_DEFAULT_USER_CS 6 +#define GDT_ENTRY_DEFAULT_USER32_CS 4 +#define GDT_ENTRY_DEFAULT_USER_DS 5 +#define GDT_ENTRY_DEFAULT_USER_CS 6 -#define GDT_ENTRY_TSS 8 /* needs two entries */ -#define GDT_ENTRY_LDT 10 /* needs two entries */ -#define GDT_ENTRY_TLS_MIN 12 -#define GDT_ENTRY_TLS_MAX 14 +/* Needs two entries */ +#define GDT_ENTRY_TSS 8 +/* Needs two entries */ +#define GDT_ENTRY_LDT 10 -#define GDT_ENTRY_PER_CPU 15 /* abused to load per CPU data from limit */ +#define GDT_ENTRY_TLS_MIN 12 +#define GDT_ENTRY_TLS_MAX 14 -/* Selectors need to also have a correct RPL (+3 thingy) */ -#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) -#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) -#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3) -#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3) -#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS*8) -#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8+3) -#define __USER32_DS __USER_DS -#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8+3) +/* Abused to load per CPU data from limit */ +#define GDT_ENTRY_PER_CPU 15 -/* TLS indexes for 64bit - hardcoded in arch_prctl */ -#define FS_TLS 0 -#define GS_TLS 1 +/* + * Number of entries in the GDT table: + */ +#define GDT_ENTRIES 16 -#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) -#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) +/* + * Segment selector values corresponding to the above entries: + * + * Note, selectors also need to have a correct RPL, + * expressed with the +3 value for user-space selectors: + */ +#define __KERNEL32_CS (GDT_ENTRY_KERNEL32_CS*8) +#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8) +#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8) +#define __USER32_CS (GDT_ENTRY_DEFAULT_USER32_CS*8 + 3) +#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8 + 3) +#define __USER32_DS __USER_DS +#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8 + 3) +#define __PER_CPU_SEG (GDT_ENTRY_PER_CPU*8 + 3) -#define GDT_ENTRIES 16 +/* TLS indexes for 64-bit - hardcoded in arch_prctl(): */ +#define FS_TLS 0 +#define GS_TLS 1 + +#define GS_TLS_SEL ((GDT_ENTRY_TLS_MIN+GS_TLS)*8 + 3) +#define FS_TLS_SEL ((GDT_ENTRY_TLS_MIN+FS_TLS)*8 + 3) #endif #ifndef CONFIG_PARAVIRT -#define get_kernel_rpl() 0 +# define get_kernel_rpl() 0 #endif -#define IDT_ENTRIES 256 -#define NUM_EXCEPTION_VECTORS 32 -/* Bitmask of exception vectors which push an error code on the stack */ -#define EXCEPTION_ERRCODE_MASK 0x00027d00 -#define GDT_SIZE (GDT_ENTRIES * 8) -#define GDT_ENTRY_TLS_ENTRIES 3 -#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8) +#define IDT_ENTRIES 256 +#define NUM_EXCEPTION_VECTORS 32 + +/* Bitmask of exception vectors which push an error code on the stack: */ +#define EXCEPTION_ERRCODE_MASK 0x00027d00 + +#define GDT_SIZE (GDT_ENTRIES*8) +#define GDT_ENTRY_TLS_ENTRIES 3 +#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES* 8) #ifdef __KERNEL__ #ifndef __ASSEMBLY__ + extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][2+2+5]; #ifdef CONFIG_TRACING -#define trace_early_idt_handlers early_idt_handlers +# define trace_early_idt_handlers early_idt_handlers #endif /* @@ -211,37 +258,30 @@ do { \ } while (0) /* - * Save a segment register away + * Save a segment register away: */ #define savesegment(seg, 
value) \ asm("mov %%" #seg ",%0":"=r" (value) : : "memory") /* - * x86_32 user gs accessors. + * x86-32 user GS accessors: */ #ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_32_LAZY_GS -#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) -#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) -#define task_user_gs(tsk) ((tsk)->thread.gs) -#define lazy_save_gs(v) savesegment(gs, (v)) -#define lazy_load_gs(v) loadsegment(gs, (v)) -#else /* X86_32_LAZY_GS */ -#define get_user_gs(regs) (u16)((regs)->gs) -#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) -#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) -#define lazy_save_gs(v) do { } while (0) -#define lazy_load_gs(v) do { } while (0) -#endif /* X86_32_LAZY_GS */ +# ifdef CONFIG_X86_32_LAZY_GS +# define get_user_gs(regs) (u16)({ unsigned long v; savesegment(gs, v); v; }) +# define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) +# define task_user_gs(tsk) ((tsk)->thread.gs) +# define lazy_save_gs(v) savesegment(gs, (v)) +# define lazy_load_gs(v) loadsegment(gs, (v)) +# else /* X86_32_LAZY_GS */ +# define get_user_gs(regs) (u16)((regs)->gs) +# define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) +# define task_user_gs(tsk) (task_pt_regs(tsk)->gs) +# define lazy_save_gs(v) do { } while (0) +# define lazy_load_gs(v) do { } while (0) +# endif /* X86_32_LAZY_GS */ #endif /* X86_32 */ -static inline unsigned long get_limit(unsigned long segment) -{ - unsigned long __limit; - asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); - return __limit + 1; -} - #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ From 146b2b097d7a322b64b88a927fc5d870fc79a60b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 25 Mar 2015 18:18:13 +0100 Subject: [PATCH 098/148] x86/asm/entry/64: Use better label name, fix comments A named label "ret_from_sys_call" implies that there are jumps to this location from elsewhere, as happens with many other labels in this file. But this label is used only by the JMP a few insns above. To make that obvious, use local numeric label instead. Improve comments: "and return regs->ax" isn't too informative. We always return regs->ax. The comment suggesting that it'd be cool to use rip relative addressing for CALL is deleted. It's unclear why that would be an improvement - we aren't striving to use position-independent code here. PIC code here would require something like LEA sys_call_table(%rip),reg + CALL *(reg,%rax*8)... "iret frame is also incomplete" is no longer true, fix that too. Also fix typo in comment. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427303896-24023-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index bf9afadbb99e..9988c4b2de33 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -258,16 +258,15 @@ system_call_fastpath: andl $__SYSCALL_MASK,%eax cmpl $__NR_syscall_max,%eax #endif - ja ret_from_sys_call /* and return regs->ax */ + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ movq %r10,%rcx - call *sys_call_table(,%rax,8) # XXX: rip relative + call *sys_call_table(,%rax,8) movq %rax,RAX(%rsp) +1: /* - * Syscall return path ending with SYSRET (fast path) - * Has incompletely filled pt_regs, iret frame is also incomplete. + * Syscall return path ending with SYSRET (fast path). + * Has incompletely filled pt_regs. */ -ret_from_sys_call: - LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -1407,7 +1406,7 @@ ENTRY(nmi) * NMI. */ - /* Use %rdx as out temp variable throughout */ + /* Use %rdx as our temp variable throughout */ pushq_cfi %rdx CFI_REL_OFFSET rdx, 0 From 47eb582e702880c302036d17341c7ea1a7dc2a53 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 25 Mar 2015 18:18:15 +0100 Subject: [PATCH 099/148] x86/asm/entry/64: Use smaller instructions The $AUDIT_ARCH_X86_64 parameter to syscall_trace_enter_phase1/2 is a 32-bit constant, loading it with 32-bit MOV produces 5-byte insn instead of 10-byte MOVABS one. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427303896-24023-3-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 9988c4b2de33..f85d2ccec6d2 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -305,7 +305,7 @@ system_call_fastpath: /* Do syscall entry tracing */ tracesys: movq %rsp, %rdi - movq $AUDIT_ARCH_X86_64, %rsi + movl $AUDIT_ARCH_X86_64, %esi call syscall_trace_enter_phase1 test %rax, %rax jnz tracesys_phase2 /* if needed, run the slow path */ @@ -316,7 +316,7 @@ tracesys: tracesys_phase2: SAVE_EXTRA_REGS movq %rsp, %rdi - movq $AUDIT_ARCH_X86_64, %rsi + movl $AUDIT_ARCH_X86_64, %esi movq %rax,%rdx call syscall_trace_enter_phase2 From 40e2ec657dcb0ae328db1abc8e37df4caa893391 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 25 Mar 2015 21:14:26 +0100 Subject: [PATCH 100/148] x86/irq/tracing: Move ARCH_LOCKDEP_SYS_EXIT defines closer to their users This change simply moves defines around (even if it's not obvious in a patch form). Nothing is changed. This is a preparation for folding ARCH_LOCKDEP_SYS_EXIT defines into their users. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427314468-12763-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irqflags.h | 51 ++++++++++++++++----------------- 1 file changed, 24 insertions(+), 27 deletions(-) diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 021bee9b86b6..55866c26d447 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -163,33 +163,9 @@ static inline int arch_irqs_disabled(void) return arch_irqs_disabled_flags(flags); } +#endif /* !__ASSEMBLY__ */ -#else - -#ifdef CONFIG_X86_64 -#define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk -#define ARCH_LOCKDEP_SYS_EXIT_IRQ \ - TRACE_IRQS_ON; \ - sti; \ - SAVE_EXTRA_REGS; \ - LOCKDEP_SYS_EXIT; \ - RESTORE_EXTRA_REGS; \ - cli; \ - TRACE_IRQS_OFF; - -#else -#define ARCH_LOCKDEP_SYS_EXIT \ - pushl %eax; \ - pushl %ecx; \ - pushl %edx; \ - call lockdep_sys_exit; \ - popl %edx; \ - popl %ecx; \ - popl %eax; - -#define ARCH_LOCKDEP_SYS_EXIT_IRQ -#endif - +#ifdef __ASSEMBLY__ #ifdef CONFIG_TRACE_IRQFLAGS # define TRACE_IRQS_ON call trace_hardirqs_on_thunk; # define TRACE_IRQS_OFF call trace_hardirqs_off_thunk; @@ -198,12 +174,33 @@ static inline int arch_irqs_disabled(void) # define TRACE_IRQS_OFF #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC +# ifdef CONFIG_X86_64 +# define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk +# define ARCH_LOCKDEP_SYS_EXIT_IRQ \ + TRACE_IRQS_ON; \ + sti; \ + SAVE_EXTRA_REGS; \ + LOCKDEP_SYS_EXIT; \ + RESTORE_EXTRA_REGS; \ + cli; \ + TRACE_IRQS_OFF; +# else +# define ARCH_LOCKDEP_SYS_EXIT \ + pushl %eax; \ + pushl %ecx; \ + pushl %edx; \ + call lockdep_sys_exit; \ + popl %edx; \ + popl %ecx; \ + popl %eax; +# define ARCH_LOCKDEP_SYS_EXIT_IRQ +# endif # define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT # define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ # else # define LOCKDEP_SYS_EXIT # define LOCKDEP_SYS_EXIT_IRQ # endif - #endif /* __ASSEMBLY__ */ + #endif From 7dc7cc0780b04935f1127fa22ee23e9d6daf166a Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 25 Mar 2015 21:14:27 +0100 Subject: [PATCH 101/148] x86/irq/tracing: Fold ARCH_LOCKDEP_SYS_EXIT defines into their users There is no need to have an extra level of macro indirection here. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427314468-12763-3-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irqflags.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 55866c26d447..19355f34c4a3 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -175,17 +175,17 @@ static inline int arch_irqs_disabled(void) #endif #ifdef CONFIG_DEBUG_LOCK_ALLOC # ifdef CONFIG_X86_64 -# define ARCH_LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk -# define ARCH_LOCKDEP_SYS_EXIT_IRQ \ +# define LOCKDEP_SYS_EXIT call lockdep_sys_exit_thunk +# define LOCKDEP_SYS_EXIT_IRQ \ TRACE_IRQS_ON; \ sti; \ SAVE_EXTRA_REGS; \ - LOCKDEP_SYS_EXIT; \ + call lockdep_sys_exit_thunk; \ RESTORE_EXTRA_REGS; \ cli; \ TRACE_IRQS_OFF; # else -# define ARCH_LOCKDEP_SYS_EXIT \ +# define LOCKDEP_SYS_EXIT \ pushl %eax; \ pushl %ecx; \ pushl %edx; \ @@ -193,14 +193,12 @@ static inline int arch_irqs_disabled(void) popl %edx; \ popl %ecx; \ popl %eax; -# define ARCH_LOCKDEP_SYS_EXIT_IRQ +# define LOCKDEP_SYS_EXIT_IRQ # endif -# define LOCKDEP_SYS_EXIT ARCH_LOCKDEP_SYS_EXIT -# define LOCKDEP_SYS_EXIT_IRQ ARCH_LOCKDEP_SYS_EXIT_IRQ -# else +#else # define LOCKDEP_SYS_EXIT # define LOCKDEP_SYS_EXIT_IRQ -# endif +#endif #endif /* __ASSEMBLY__ */ #endif From aa6d9a128b861fe7e9dc37bcc37179837674b739 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 25 Mar 2015 21:14:28 +0100 Subject: [PATCH 102/148] x86/irq/tracing: Do not save callee-preserved registers around lockdep_sys_exit_thunk Internally, lockdep_sys_exit_thunk saves callee-clobbered registers, and calls a C function, lockdep_sys_exit. Thus, callee-preserved registers won't be mangled, there is no need to save them. Patch was run-tested. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427314468-12763-4-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irqflags.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 19355f34c4a3..9a63eae04e4b 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -179,9 +179,7 @@ static inline int arch_irqs_disabled(void) # define LOCKDEP_SYS_EXIT_IRQ \ TRACE_IRQS_ON; \ sti; \ - SAVE_EXTRA_REGS; \ call lockdep_sys_exit_thunk; \ - RESTORE_EXTRA_REGS; \ cli; \ TRACE_IRQS_OFF; # else From 487d1edb9a6005cf790c7fe59f25ad1e5cb5817b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 27 Mar 2015 11:59:16 +0100 Subject: [PATCH 103/148] x86/asm/entry/64: Fix comment about SYSENTER MSRs The comment is ancient, it dates to the time when only AMD's x86_64 implementation existed. AMD wasn't (and still isn't) supporting SYSENTER, so these writes were "just in case" back then. This has changed: Intel's x86_64 appeared, and Intel does support SYSENTER in long mode. "Some future 64-bit CPU" is here already. The code may appear "buggy" for AMD as it stands, since MSR_IA32_SYSENTER_EIP is only 32-bit for AMD CPUs. Writing a kernel function's address to it would drop high bits. 
Subsequent use of this MSR for a branch via SYSENTER seems to allow the user to transition to CPL0 while executing his code. Scary, eh? Explain why that is not a bug: because the SYSENTER insn would not work on AMD CPUs. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427453956-21931-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c928a7ae1099..71e4adcb15f1 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1173,8 +1173,10 @@ void syscall_init(void) #ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, ia32_cstar_target); /* - * Always load these, in case some future 64-bit CPU supports - * SYSENTER from compat mode too: + * This only works on Intel CPUs. + * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. + * This does not cause SYSENTER to jump to the wrong location, because + * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); From 27be87c5d53117f048d590d6fc6febb21176c3e9 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 27 Mar 2015 11:36:19 +0100 Subject: [PATCH 104/148] x86/asm/entry/64: Add missing CFI annotation This is a missing bit of the recent MOV-to-PUSH conversion. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427452582-21624-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index f85d2ccec6d2..dbfc8875d735 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -248,6 +248,7 @@ GLOBAL(system_call_after_swapgs) pushq_cfi_reg r10 /* pt_regs->r10 */ pushq_cfi_reg r11 /* pt_regs->r11 */ sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ + CFI_ADJUST_CFA_OFFSET 6*8 testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz tracesys From a232e3d558eef421fbb539ede5483dfb668e38f2 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 27 Mar 2015 11:36:20 +0100 Subject: [PATCH 105/148] x86/asm/entry/32: Update "interrupt off" comments The existing comment has proven to be not very clear. Replace it with a comment similar to the one we now have in the 64-bit syscall entry point. (Three instances, one per 32-bit syscall entry). In the INT80 entry point's CFI annotations, replace mysterious expressions with numeric constants. In this case, raw numbers look more understandable. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H.
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427452582-21624-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 45 +++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 18 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 5d2641ce9957..7502ff0b938e 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -112,13 +112,16 @@ ENTRY(ia32_sysenter_target) CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,0 CFI_REGISTER rsp,rbp + + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ SWAPGS_UNSAFE_STACK movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp - /* - * No need to follow this irqs on/off section: the syscall - * disabled irqs, here we enable it straight after entry: - */ ENABLE_INTERRUPTS(CLBR_NONE) + /* Construct iret frame (ss,rsp,rflags,cs,rip) */ movl %ebp,%ebp /* zero extension */ pushq_cfi $__USER32_DS @@ -314,15 +317,18 @@ ENTRY(ia32_cstar_target) CFI_DEF_CFA rsp,0 CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ + + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ SWAPGS_UNSAFE_STACK movl %esp,%r8d CFI_REGISTER rsp,r8 movq PER_CPU_VAR(kernel_stack),%rsp - /* - * No need to follow this irqs on/off section: the syscall - * disabled irqs and here we enable it straight after entry: - */ ENABLE_INTERRUPTS(CLBR_NONE) + ALLOC_PT_GPREGS_ON_STACK 6*8 /* 6*8: space for orig_ax and iret frame */ SAVE_C_REGS_EXCEPT_RCX_R891011 movl %eax,%eax /* zero extension */ @@ -449,19 +455,22 @@ ia32_badarg: ENTRY(ia32_syscall) CFI_STARTPROC32 simple CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8-RIP - /*CFI_REL_OFFSET ss,SS-RIP*/ - CFI_REL_OFFSET rsp,RSP-RIP - /*CFI_REL_OFFSET rflags,EFLAGS-RIP*/ - /*CFI_REL_OFFSET cs,CS-RIP*/ - CFI_REL_OFFSET rip,RIP-RIP + CFI_DEF_CFA rsp,5*8 + /*CFI_REL_OFFSET ss,4*8 */ + CFI_REL_OFFSET rsp,3*8 + /*CFI_REL_OFFSET rflags,2*8 */ + /*CFI_REL_OFFSET cs,1*8 */ + CFI_REL_OFFSET rip,0*8 + + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ PARAVIRT_ADJUST_EXCEPTION_FRAME SWAPGS - /* - * No need to follow this irqs on/off section: the syscall - * disabled irqs and here we enable it straight after entry: - */ ENABLE_INTERRUPTS(CLBR_NONE) + movl %eax,%eax pushq_cfi %rax /* store orig_ax */ cld From 4ee8ec17ba00fce4af042543771f996fb9d98d34 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 27 Mar 2015 11:36:21 +0100 Subject: [PATCH 106/148] x86/asm/entry/32: Make register zero-extension more prominent There are a couple of syscall argument zero-extension instructions in the 32-bit compat entry code, and it was mentioned that people keep trying to optimize them out, introducing bugs. Make them more visible, and add a "do not remove" comment. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427452582-21624-3-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 7502ff0b938e..dec8c1de9c9e 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -122,8 +122,11 @@ ENTRY(ia32_sysenter_target) movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp ENABLE_INTERRUPTS(CLBR_NONE) + /* Zero-extending 32-bit regs, do not remove */ + movl %ebp, %ebp + movl %eax, %eax + /* Construct iret frame (ss,rsp,rflags,cs,rip) */ - movl %ebp,%ebp /* zero extension */ pushq_cfi $__USER32_DS /*CFI_REL_OFFSET ss,0*/ pushq_cfi %rbp @@ -134,7 +137,6 @@ ENTRY(ia32_sysenter_target) CFI_REGISTER rip,r10 pushq_cfi $__USER32_CS /*CFI_REL_OFFSET cs,0*/ - movl %eax, %eax /* Store thread_info->sysenter_return in rip stack slot */ pushq_cfi %r10 CFI_REL_OFFSET rip,0 @@ -329,9 +331,11 @@ ENTRY(ia32_cstar_target) movq PER_CPU_VAR(kernel_stack),%rsp ENABLE_INTERRUPTS(CLBR_NONE) + /* Zero-extending 32-bit regs, do not remove */ + movl %eax,%eax + ALLOC_PT_GPREGS_ON_STACK 6*8 /* 6*8: space for orig_ax and iret frame */ SAVE_C_REGS_EXCEPT_RCX_R891011 - movl %eax,%eax /* zero extension */ movq %rax,ORIG_RAX(%rsp) movq %rcx,RIP(%rsp) CFI_REL_OFFSET rip,RIP @@ -471,7 +475,9 @@ ENTRY(ia32_syscall) SWAPGS ENABLE_INTERRUPTS(CLBR_NONE) - movl %eax,%eax + /* Zero-extending 32-bit regs, do not remove */ + movl %eax,%eax + pushq_cfi %rax /* store orig_ax */ cld /* note the registers are not zero extended to the sf. From 627276cb55f33e2dc56eab5b973937c128b7704c Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 30 Mar 2015 20:09:31 +0200 Subject: [PATCH 107/148] x86/asm/entry/64: Move retint_kernel code block closer to its user The "retint_kernel" code block is misplaced. Since its logical continuation is "retint_restore_args", it is more natural to place it above that label. This also makes two jumps "short". This change only moves code block around, without changing logic. This enables the next simplification: making "retint_restore_args" label a local numeric one. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427738975-7391-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index dbfc8875d735..34d60c34fca8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -740,7 +740,19 @@ opportunistic_sysret_failed: SWAPGS jmp restore_args -retint_restore_args: /* return to kernel space */ +/* Returning to kernel space */ +#ifdef CONFIG_PREEMPT + /* Interrupts are off */ + /* Check if we need preemption */ +ENTRY(retint_kernel) + cmpl $0,PER_CPU_VAR(__preempt_count) + jnz retint_restore_args + bt $9,EFLAGS(%rsp) /* interrupts were off? 
*/ jnc retint_restore_args call preempt_schedule_irq jmp exit_intr #endif retint_restore_args: DISABLE_INTERRUPTS(CLBR_ANY) /* * The iretq could re-enable interrupts: @@ -830,17 +842,6 @@ retint_signal: GET_THREAD_INFO(%rcx) jmp retint_with_reschedule -#ifdef CONFIG_PREEMPT - /* Returning to kernel space. Check if we need preemption */ - /* rcx: threadinfo. interrupts off. */ -ENTRY(retint_kernel) - cmpl $0,PER_CPU_VAR(__preempt_count) - jnz retint_restore_args - bt $9,EFLAGS(%rsp) /* interrupts off? */ - jnc retint_restore_args - call preempt_schedule_irq - jmp exit_intr -#endif CFI_ENDPROC END(common_interrupt) From a3675b32aac81c2c4733568844f8276527a37423 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 30 Mar 2015 20:09:34 +0200 Subject: [PATCH 108/148] x86/asm/entry/64: Do not GET_THREAD_INFO() too early At exit_intr, we GET_THREAD_INFO(%rcx) and then jump to retint_kernel if saved CS was from kernel. But the code at retint_kernel doesn't need %rcx. Move GET_THREAD_INFO(%rcx) down, after CS check and branch. While at it, remove "has a correct top of stack" comment. After recent changes which eliminated FIXUP_TOP_OF_STACK, we always have a correct pt_regs layout. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427738975-7391-5-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 34d60c34fca8..6f251a5ee1dc 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -658,13 +658,12 @@ ret_from_intr: CFI_ADJUST_CFA_OFFSET RBP exit_intr: - GET_THREAD_INFO(%rcx) testl $3,CS(%rsp) je retint_kernel - /* Interrupt came from user space */ + + GET_THREAD_INFO(%rcx) /* - * Has a correct top of stack. * %rcx: thread info. Interrupts off. */ retint_with_reschedule: From 55474c48b4726fd3914c1ec47fced0f931729979 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 29 Mar 2015 11:02:34 +0200 Subject: [PATCH 109/148] x86/asm/entry: Remove user_mode_ignore_vm86() user_mode_ignore_vm86() can be used instead of user_mode(), in places where we have already done a v8086_mode() security check of ptregs. But doing this check in the wrong place would be a bug that could result in security problems, and also the naming still isn't very clear. Furthermore, it only affects 32-bit kernels, while most development happens on 64-bit kernels. If we replace them with user_mode() checks then the cost is only a very minor increase in various slowpaths: text data bss dec hex filename 10573391 703562 1753042 13029995 c6d26b vmlinux.o.before 10573423 703562 1753042 13030027 c6d28b vmlinux.o.after So let's get rid of this distinction once and for all. Acked-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: Andrew Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brad Spengler Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150329090233.GA1963@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ptrace.h | 17 ----------------- arch/x86/kernel/cpu/perf_event.c | 2 +- arch/x86/kernel/traps.c | 6 +++--- 3 files changed, 4 insertions(+), 21 deletions(-) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index d20bae298852..19507ffa5d28 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -113,23 +113,6 @@ static inline int user_mode(struct pt_regs *regs) #endif } -/* - * This is the fastest way to check whether regs come from user space. - * It is unsafe if regs might come from vm86 mode, though -- in vm86 - * mode, all bits of CS and SS are completely under the user's control. - * The CPU considers vm86 mode to be CPL 3 regardless of CS and SS. - * - * Do NOT use this function unless you have already ruled out the - * possibility that regs came from vm86 mode. - * - * We check for RPL != 0 instead of RPL == 3 because we don't use rings - * 1 or 2 and this is more efficient. - */ -static inline int user_mode_ignore_vm86(struct pt_regs *regs) -{ - return (regs->cs & SEGMENT_RPL_MASK) != 0; -} - static inline int v8086_mode(struct pt_regs *regs) { #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 56f7e60ad732..e2888a3ad1e3 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2159,7 +2159,7 @@ static unsigned long code_segment_base(struct pt_regs *regs) if (regs->flags & X86_VM_MASK) return 0x10 * regs->cs; - if (user_mode_ignore_vm86(regs) && regs->cs != __USER_CS) + if (user_mode(regs) && regs->cs != __USER_CS) return get_segment_base(regs->cs); #else if (user_mode(regs) && !user_64bit_mode(regs) && diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index c8eb469a94a4..6751c5c58eec 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -207,7 +207,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, return -1; } - if (!user_mode_ignore_vm86(regs)) { + if (!user_mode(regs)) { if (!fixup_exception(regs)) { tsk->thread.error_code = error_code; tsk->thread.trap_nr = trapnr; @@ -468,7 +468,7 @@ do_general_protection(struct pt_regs *regs, long error_code) } tsk = current; - if (!user_mode_ignore_vm86(regs)) { + if (!user_mode(regs)) { if (fixup_exception(regs)) goto exit; @@ -685,7 +685,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) * We already checked v86 mode above, so we can check for kernel mode * by just checking the CPL of CS. */ - if ((dr6 & DR_STEP) && !user_mode_ignore_vm86(regs)) { + if ((dr6 & DR_STEP) && !user_mode(regs)) { tsk->thread.debugreg6 &= ~DR_STEP; set_tsk_thread_flag(tsk, TIF_SINGLESTEP); regs->flags &= ~X86_EFLAGS_TF; From 4416c5a6dacdddd55378e7011f9c8720d2a7470f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 31 Mar 2015 19:00:03 +0200 Subject: [PATCH 110/148] x86/asm/entry/64: Do not TRACE_IRQS fast SYSRET64 path SYSRET code path has a small irq-off block. On this code path, TRACE_IRQS_ON can't be called right before interrupts are enabled for real, we can't clobber registers there. So current code does it earlier, in a safe place. But with this, TRACE_IRQS_OFF/ON frames just two fast instructions, which is ridiculous: now most of irq-off block is _outside_ of the framing. 
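For context (not part of the patch): the TRACE_IRQS_* macros only update lockdep's view of the interrupt state; the actual masking is done by DISABLE_INTERRUPTS/ENABLE_INTERRUPTS or, on this path, by the flags image that SYSRET restores. A minimal C sketch of that pairing, assuming the usual <linux/irqflags.h> helpers and with the function name purely illustrative:

    #include <linux/irqflags.h>

    /*
     * Sketch only: an irq-off region is normally bracketed so that
     * lockdep's view (trace_hardirqs_*) matches the hardware state.
     * The entry code does the same thing in assembly with the
     * TRACE_IRQS_OFF/ON macros.
     */
    static void guarded_region(void)
    {
            raw_local_irq_disable();        /* hardware: IF cleared */
            trace_hardirqs_off();           /* lockdep: interrupts are off */

            /* ... work that must run with interrupts disabled ... */

            trace_hardirqs_on();            /* lockdep: about to re-enable */
            raw_local_irq_enable();         /* hardware: IF set again */
    }

When the annotated window shrinks to a couple of instructions, the annotation is all bookkeeping and no information, which is why it is dropped here.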
Do the same thing that we do on SYSCALL entry: do not track this irq-off block, it is very small to ever cause noticeable irq latency. Be careful: make sure that "jnz int_ret_from_sys_call_irqs_off" now does invoke TRACE_IRQS_OFF - move int_ret_from_sys_call_irqs_off label before TRACE_IRQS_OFF. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427821211-25099-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 6f251a5ee1dc..f6e37de02ba7 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -269,8 +269,11 @@ system_call_fastpath: * Has incompletely filled pt_regs. */ LOCKDEP_SYS_EXIT + /* + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF /* * We must check ti flags with interrupts (or at least preemption) @@ -284,10 +287,7 @@ system_call_fastpath: jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ CFI_REMEMBER_STATE - /* - * sysretq will re-enable interrupts: - */ - TRACE_IRQS_ON + RESTORE_C_REGS_EXCEPT_RCX_R11 movq RIP(%rsp),%rcx CFI_REGISTER rip,rcx @@ -298,6 +298,7 @@ system_call_fastpath: * 64bit SYSRET restores rip from rcx, * rflags from r11 (but RF and VM bits are forced to 0), * cs and ss are loaded from MSRs. + * Restoration of rflags re-enables interrupts. */ USERGS_SYSRET64 @@ -346,8 +347,8 @@ tracesys_phase2: */ GLOBAL(int_ret_from_sys_call) DISABLE_INTERRUPTS(CLBR_NONE) +int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ TRACE_IRQS_OFF -int_ret_from_sys_call_irqs_off: movl $_TIF_ALLWORK_MASK,%edi /* edi: mask to check */ GLOBAL(int_with_check) From 4c9c0e919fef05b3bc6a8aff1db7a31b2ba4f4b6 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 31 Mar 2015 19:00:04 +0200 Subject: [PATCH 111/148] x86/asm/entry/32: Use smaller PUSH instructions instead of MOV, to build 'pt_regs' on stack This mimics the recent similar 64-bit change. Saves ~110 bytes of code. Patch was run-tested on 32 and 64 bits, Intel and AMD CPU. I also looked at the diff of entry_64.o disassembly, to have a different view of the changes. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427821211-25099-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 82 ++++++++++++++++++++++----------------- 1 file changed, 46 insertions(+), 36 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index dec8c1de9c9e..8d01cce7b6b8 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -126,26 +126,27 @@ ENTRY(ia32_sysenter_target) movl %ebp, %ebp movl %eax, %eax - /* Construct iret frame (ss,rsp,rflags,cs,rip) */ - pushq_cfi $__USER32_DS - /*CFI_REL_OFFSET ss,0*/ - pushq_cfi %rbp - CFI_REL_OFFSET rsp,0 - pushfq_cfi - /*CFI_REL_OFFSET rflags,0*/ - movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 3*8), %r10d + movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d CFI_REGISTER rip,r10 - pushq_cfi $__USER32_CS - /*CFI_REL_OFFSET cs,0*/ - /* Store thread_info->sysenter_return in rip stack slot */ - pushq_cfi %r10 - CFI_REL_OFFSET rip,0 - /* Store orig_ax */ - pushq_cfi %rax - /* Construct the rest of "struct pt_regs" */ + + /* Construct struct pt_regs on stack */ + pushq_cfi $__USER32_DS /* pt_regs->ss */ + pushq_cfi %rbp /* pt_regs->sp */ + CFI_REL_OFFSET rsp,0 + pushfq_cfi /* pt_regs->flags */ + pushq_cfi $__USER32_CS /* pt_regs->cs */ + pushq_cfi %r10 /* pt_regs->ip = thread_info->sysenter_return */ + CFI_REL_OFFSET rip,0 + pushq_cfi_reg rax /* pt_regs->orig_ax */ + pushq_cfi_reg rdi /* pt_regs->di */ + pushq_cfi_reg rsi /* pt_regs->si */ + pushq_cfi_reg rdx /* pt_regs->dx */ + pushq_cfi_reg rcx /* pt_regs->cx */ + pushq_cfi_reg rax /* pt_regs->ax */ cld - ALLOC_PT_GPREGS_ON_STACK - SAVE_C_REGS_EXCEPT_R891011 + sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + CFI_ADJUST_CFA_OFFSET 10*8 + /* * no need to do an access_ok check here because rbp has been * 32bit zero extended @@ -334,20 +335,24 @@ ENTRY(ia32_cstar_target) /* Zero-extending 32-bit regs, do not remove */ movl %eax,%eax - ALLOC_PT_GPREGS_ON_STACK 6*8 /* 6*8: space for orig_ax and iret frame */ - SAVE_C_REGS_EXCEPT_RCX_R891011 - movq %rax,ORIG_RAX(%rsp) - movq %rcx,RIP(%rsp) - CFI_REL_OFFSET rip,RIP - movq %rbp,RCX(%rsp) /* this lies slightly to ptrace */ + /* Construct struct pt_regs on stack */ + pushq_cfi $__USER32_DS /* pt_regs->ss */ + pushq_cfi %r8 /* pt_regs->sp */ + CFI_REL_OFFSET rsp,0 + pushq_cfi %r11 /* pt_regs->flags */ + pushq_cfi $__USER32_CS /* pt_regs->cs */ + pushq_cfi %rcx /* pt_regs->ip */ + CFI_REL_OFFSET rip,0 + pushq_cfi_reg rax /* pt_regs->orig_ax */ + pushq_cfi_reg rdi /* pt_regs->di */ + pushq_cfi_reg rsi /* pt_regs->si */ + pushq_cfi_reg rdx /* pt_regs->dx */ + pushq_cfi_reg rbp /* pt_regs->cx */ movl %ebp,%ecx - movq $__USER32_CS,CS(%rsp) - movq $__USER32_DS,SS(%rsp) - movq %r11,EFLAGS(%rsp) - /*CFI_REL_OFFSET rflags,EFLAGS*/ - movq %r8,RSP(%rsp) - CFI_REL_OFFSET rsp,RSP - /* iret stack frame is complete now */ + pushq_cfi_reg rax /* pt_regs->ax */ + sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + CFI_ADJUST_CFA_OFFSET 10*8 + /* * no need to do an access_ok check here because r8 has been * 32bit zero extended @@ -478,12 +483,17 @@ ENTRY(ia32_syscall) /* Zero-extending 32-bit regs, do not remove */ movl %eax,%eax - pushq_cfi %rax /* store orig_ax */ + /* Construct struct pt_regs on stack (iret frame is already on stack) */ + pushq_cfi_reg rax /* pt_regs->orig_ax */ + pushq_cfi_reg rdi /* pt_regs->di */ + pushq_cfi_reg rsi /* pt_regs->si */ + pushq_cfi_reg rdx 
/* pt_regs->dx */ + pushq_cfi_reg rcx /* pt_regs->cx */ + pushq_cfi_reg rax /* pt_regs->ax */ cld - /* note the registers are not zero extended to the sf. - this could be a problem. */ - ALLOC_PT_GPREGS_ON_STACK - SAVE_C_REGS_EXCEPT_R891011 + sub $(10*8),%rsp /* pt_regs->r8-11,bp,bx,r12-15 not saved */ + CFI_ADJUST_CFA_OFFSET 10*8 + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz ia32_tracesys From 6ba71b7617f1fa65f19bd34f4484a0694ef9a520 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 31 Mar 2015 19:00:05 +0200 Subject: [PATCH 112/148] x86/asm/entry/64: Simplify retint_kernel label usage, make retint_restore_args label local Get rid of #define obfuscation of retint_kernel in CONFIG_PREEMPT case by defining retint_kernel label always, not only for CONFIG_PREEMPT. Strip retint_kernel of .global-ness (ENTRY macro) - it has no users outside of this file. This looks like cosmetics, but it is not: "je LABEL" can be optimized to short jump by assember only if LABEL is not global, for global labels jump is always a near one with relocation. Convert retint_restore_args to a local numeric label, making it clearer that it is not used elsewhere in the file. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427821211-25099-3-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index f6e37de02ba7..1879c55bf0f7 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -57,10 +57,6 @@ .section .entry.text, "ax" -#ifndef CONFIG_PREEMPT -#define retint_kernel retint_restore_args -#endif - #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret64) swapgs @@ -741,18 +737,18 @@ opportunistic_sysret_failed: jmp restore_args /* Returning to kernel space */ +retint_kernel: #ifdef CONFIG_PREEMPT /* Interrupts are off */ /* Check if we need preemption */ -ENTRY(retint_kernel) cmpl $0,PER_CPU_VAR(__preempt_count) - jnz retint_restore_args + jnz 1f bt $9,EFLAGS(%rsp) /* interrupts were off? */ - jnc retint_restore_args + jnc 1f call preempt_schedule_irq jmp exit_intr +1: #endif -retint_restore_args: DISABLE_INTERRUPTS(CLBR_ANY) /* * The iretq could re-enable interrupts: From 32a04077fe401842424a4b555572fa459c01e0a3 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 31 Mar 2015 19:00:06 +0200 Subject: [PATCH 113/148] x86/asm/entry/64: Remove redundant DISABLE_INTERRUPTS() At this location, we already have interrupts off, always. To be more specific, we already disabled them here: ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427821211-25099-4-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1879c55bf0f7..9f8d01f1f1b2 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -749,7 +749,6 @@ retint_kernel: jmp exit_intr 1: #endif - DISABLE_INTERRUPTS(CLBR_ANY) /* * The iretq could re-enable interrupts: */ From 36acef2510853e2831047ca9e22d333ba7a1047b Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 31 Mar 2015 19:00:07 +0200 Subject: [PATCH 114/148] x86/asm/entry/64: Simplify looping around preempt_schedule_irq() At the 'exit_intr' label we test whether interrupt/exception was in kernel. If it did, we jump to the preemption check. If preemption does happen (IOW if we call preempt_schedule_irq()), we go back to 'exit_intr'. But it's pointless, we already know that the test succeeded last time, preemption doesn't change the fact that interrupt/exception was in the kernel. We can go back directly to checking PER_CPU_VAR(__preempt_count) instead. This makes the 'exit_intr' label unused, drop it. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427821211-25099-5-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 9f8d01f1f1b2..bad285d84a9f 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -654,7 +654,6 @@ ret_from_intr: CFI_DEF_CFA_REGISTER rsp CFI_ADJUST_CFA_OFFSET RBP -exit_intr: testl $3,CS(%rsp) je retint_kernel /* Interrupt came from user space */ @@ -741,12 +740,12 @@ retint_kernel: #ifdef CONFIG_PREEMPT /* Interrupts are off */ /* Check if we need preemption */ - cmpl $0,PER_CPU_VAR(__preempt_count) - jnz 1f bt $9,EFLAGS(%rsp) /* interrupts were off? */ jnc 1f +0: cmpl $0,PER_CPU_VAR(__preempt_count) + jnz 1f call preempt_schedule_irq - jmp exit_intr + jmp 0b 1: #endif /* From a734b4a23e4b5a5bba577d11b6e2ff21f6ca4fce Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 31 Mar 2015 19:00:10 +0200 Subject: [PATCH 115/148] x86/asm: Replace "MOVQ $imm, %reg" with MOVL There is no reason to use MOVQ to load a non-negative immediate constant value into a 64-bit register. MOVL does the same, since the upper 32 bits are zero-extended by the CPU. This makes the code a bit smaller, while leaving functionality unchanged. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427821211-25099-8-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 2 +- arch/x86/crypto/twofish-x86_64-asm_64.S | 4 ++-- arch/x86/kernel/relocate_kernel_64.S | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 26d49ebae040..225be06edc80 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -178,7 +178,7 @@ continue_block: ## 2a) PROCESS FULL BLOCKS: ################################################################ full_block: - movq $128,%rax + movl $128,%eax lea 128*8*2(block_0), block_1 lea 128*8*3(block_0), block_2 add $128*8*1, block_0 diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index a039d21986a2..a350c990dc86 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S @@ -264,7 +264,7 @@ ENTRY(twofish_enc_blk) movq R1, 8(%rsi) popq R1 - movq $1,%rax + movl $1,%eax ret ENDPROC(twofish_enc_blk) @@ -316,6 +316,6 @@ ENTRY(twofish_dec_blk) movq R1, 8(%rsi) popq R1 - movq $1,%rax + movl $1,%eax ret ENDPROC(twofish_dec_blk) diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 04cb1790a596..98111b38ebfd 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -123,7 +123,7 @@ identity_mapped: * Set cr4 to a known state: * - physical address extension enabled */ - movq $X86_CR4_PAE, %rax + movl $X86_CR4_PAE, %eax movq %rax, %cr4 jmp 1f @@ -246,17 +246,17 @@ swap_pages: movq %rsi, %rax movq %r10, %rdi - movq $512, %rcx + movl $512, %ecx rep ; movsq movq %rax, %rdi movq %rdx, %rsi - movq $512, %rcx + movl $512, %ecx rep ; movsq movq %rdx, %rdi movq %r10, %rsi - movq $512, %rcx + movl $512, %ecx rep ; movsq lea PAGE_SIZE(%rax), %rsi From a6de5a21fb25cdbbdf3c3e9afd8481581c4f2464 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 31 Mar 2015 19:00:11 +0200 Subject: [PATCH 116/148] x86/asm/entry/64: Use local label to skip around sycall dispatch Logically, we just want to jump around the following instruction and its prologue/epilogue: call *sys_call_table(,%rax,8) if the syscall number is too big - we do not specifically target the "int_ret_from_sys_call" label. Use a local, numerical label for this jump, for more clarity. This also makes the code smaller: -ffffffff8187756b: 0f 87 0f 00 00 00 ja ffffffff81877580 +ffffffff8187756b: 77 0f ja ffffffff8187757c because jumps to global labels are never translated to short jump instructions by GAS. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427821211-25099-9-git-send-email-dvlasenk@redhat.com [ Improved the changelog. 
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index bad285d84a9f..03c52e217680 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -331,10 +331,11 @@ tracesys_phase2: andl $__SYSCALL_MASK,%eax cmpl $__NR_syscall_max,%eax #endif - ja int_ret_from_sys_call /* RAX(%rsp) is already set */ + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ movq %r10,%rcx /* fixup for C */ call *sys_call_table(,%rax,8) movq %rax,RAX(%rsp) +1: /* Use IRET because user could have changed pt_regs->foo */ /* From 0784b36448a2a85b95b6eb21a69b9045c896c065 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 1 Apr 2015 16:50:57 +0200 Subject: [PATCH 117/148] x86/asm/entry/64: Fold the 'test_in_nmi' macro into its only user No code changes. Signed-off-by: Denys Vlasenko Acked-by: Borislav Petkov Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427899858-7165-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 03c52e217680..386375d43d14 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1355,19 +1355,7 @@ ENTRY(error_exit) CFI_ENDPROC END(error_exit) -/* - * Test if a given stack is an NMI stack or not. - */ - .macro test_in_nmi reg stack nmi_ret normal_ret - cmpq %\reg, \stack - ja \normal_ret - subq $EXCEPTION_STKSZ, %\reg - cmpq %\reg, \stack - jb \normal_ret - jmp \nmi_ret - .endm - - /* runs on exception stack */ +/* Runs on exception stack */ ENTRY(nmi) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME @@ -1428,8 +1416,18 @@ ENTRY(nmi) * We check the variable because the first NMI could be in a * breakpoint routine using a breakpoint stack. */ - lea 6*8(%rsp), %rdx - test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi + lea 6*8(%rsp), %rdx + /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ + cmpq %rdx, 4*8(%rsp) + /* If the stack pointer is above the NMI stack, this is a normal NMI */ + ja first_nmi + subq $EXCEPTION_STKSZ, %rdx + cmpq %rdx, 4*8(%rsp) + /* If it is below the NMI stack, it is a normal NMI */ + jb first_nmi + /* Ah, it is within the NMI stack, treat it as nested */ + jmp nested_nmi + CFI_REMEMBER_STATE nested_nmi: From 40e4f2d177f748a83e7639554ea7d11568a9fa1f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 1 Apr 2015 16:50:58 +0200 Subject: [PATCH 118/148] x86/asm/boot/64: Use __BOOT_TSS instead of literal $0x20 __BOOT_TSS = (GDT_ENTRY_BOOT_TSS * 8) GDT_ENTRY_BOOT_TSS = (GDT_ENTRY_BOOT_CS + 2) GDT_ENTRY_BOOT_CS = 2 (2 + 2) * 8 = 4 * 8 = 32 = 0x20 No code changes. Signed-off-by: Denys Vlasenko Reviewed-by: Steven Rostedt Acked-by: Borislav Petkov Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Will Drewry Link: http://lkml.kernel.org/r/1427899858-7165-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/head_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 90c152135e92..b0c0d16ef58d 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -165,7 +165,7 @@ ENTRY(startup_32) /* After gdt is loaded */ xorl %eax, %eax lldt %ax - movl $0x20, %eax + movl $__BOOT_TSS, %eax ltr %ax /* From 3f85483bd80ef1de8cbbf0361be59f6a069b59d4 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Wed, 1 Apr 2015 10:12:14 -0400 Subject: [PATCH 119/148] x86/cpu: Factor out common CPU initialization code, fix 32-bit Xen PV guests Some of x86 bare-metal and Xen CPU initialization code is common between the two and therefore can be factored out to avoid code duplication. As a side effect, doing so will also extend the fix provided by commit a7fcf28d431e ("x86/asm/entry: Replace this_cpu_sp0() with current_top_of_stack() to x86_32") to 32-bit Xen PV guests. Signed-off-by: Boris Ostrovsky Reviewed-by: David Vrabel Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: konrad.wilk@oracle.com Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1427897534-5086-1-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/smp.h | 1 + arch/x86/kernel/smpboot.c | 37 ++++++++++++++++++++++--------------- arch/x86/xen/smp.c | 13 +------------ 3 files changed, 24 insertions(+), 27 deletions(-) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 8cd1cc3bc835..81d02fc7dafa 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -154,6 +154,7 @@ void cpu_die_common(unsigned int cpu); void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); void native_smp_cpus_done(unsigned int max_cpus); +void common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_disable(void); void native_cpu_die(unsigned int cpu); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7b20ffd2fffc..5b298a95d567 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -779,6 +779,26 @@ out: return boot_error; } +void common_cpu_up(unsigned int cpu, struct task_struct *idle) +{ + /* Just in case we booted with a single CPU. */ + alternatives_enable_smp(); + + per_cpu(current_task, cpu) = idle; + +#ifdef CONFIG_X86_32 + /* Stack for startup_32 can be just as for start_secondary onwards */ + irq_ctx_init(cpu); + per_cpu(cpu_current_top_of_stack, cpu) = + (unsigned long)task_stack_page(idle) + THREAD_SIZE; +#else + clear_tsk_thread_flag(idle, TIF_FORK); + initial_gs = per_cpu_offset(cpu); +#endif + per_cpu(kernel_stack, cpu) = + (unsigned long)task_stack_page(idle) + THREAD_SIZE; +} + /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad * (ie clustered apic addressing mode), this is a LOGICAL apic ID. @@ -796,24 +816,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) int cpu0_nmi_registered = 0; unsigned long timeout; - /* Just in case we booted with a single CPU. 
*/ - alternatives_enable_smp(); - idle->thread.sp = (unsigned long) (((struct pt_regs *) (THREAD_SIZE + task_stack_page(idle))) - 1); - per_cpu(current_task, cpu) = idle; -#ifdef CONFIG_X86_32 - /* Stack for startup_32 can be just as for start_secondary onwards */ - irq_ctx_init(cpu); - per_cpu(cpu_current_top_of_stack, cpu) = - (unsigned long)task_stack_page(idle) + THREAD_SIZE; -#else - clear_tsk_thread_flag(idle, TIF_FORK); - initial_gs = per_cpu_offset(cpu); -#endif - per_cpu(kernel_stack, cpu) = - (unsigned long)task_stack_page(idle) + THREAD_SIZE; early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; stack_start = idle->thread.sp; @@ -954,6 +959,8 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) /* the FPU context is blank, nobody can own it */ __cpu_disable_lazy_restore(cpu); + common_cpu_up(cpu, tidle); + err = do_boot_cpu(apicid, cpu, tidle); if (err) { pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 765b7684f858..7413ee3706d0 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -445,14 +445,7 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) { int rc; - per_cpu(current_task, cpu) = idle; -#ifdef CONFIG_X86_32 - irq_ctx_init(cpu); -#else - clear_tsk_thread_flag(idle, TIF_FORK); -#endif - per_cpu(kernel_stack, cpu) = - (unsigned long)task_stack_page(idle) + THREAD_SIZE; + common_cpu_up(cpu, idle); xen_setup_runstate_info(cpu); xen_setup_timer(cpu); @@ -467,10 +460,6 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle) if (rc) return rc; - if (num_online_cpus() == 1) - /* Just in case we booted with a single CPU. */ - alternatives_enable_smp(); - rc = xen_smp_intr_init(cpu); if (rc) return rc; From d9dc64f30abe42f71bc7e9eb9d38c41006cf39f9 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 27 Jan 2015 09:53:51 -0700 Subject: [PATCH 120/148] x86/asm: Add support for the CLWB instruction Add support for the new CLWB (cache line write back) instruction. This instruction was announced in the document "Intel Architecture Instruction Set Extensions Programming Reference" with reference number 319433-022. https://software.intel.com/sites/default/files/managed/0d/53/319433-022.pdf The CLWB instruction is used to write back the contents of dirtied cache lines to memory without evicting the cache lines from the processor's cache hierarchy. This should be used in favor of clflushopt or clflush in cases where you require the cache line to be written to memory but plan to access the data again in the near future. One of the main use cases for this is with persistent memory where CLWB can be used with PCOMMIT to ensure that data has been accepted to memory and is durable on the DIMM. This function shows how to properly use CLWB/CLFLUSHOPT/CLFLUSH and PCOMMIT with appropriate fencing: void flush_and_commit_buffer(void *vaddr, unsigned int size) { void *vend = vaddr + size - 1; for (; vaddr < vend; vaddr += boot_cpu_data.x86_clflush_size) clwb(vaddr); /* Flush any possible final partial cacheline */ clwb(vend); /* * Use SFENCE to order CLWB/CLFLUSHOPT/CLFLUSH cache flushes. * (MFENCE via mb() also works) */ wmb(); /* PCOMMIT and the required SFENCE for ordering */ pcommit_sfence(); } After this function completes the data pointed to by vaddr is has been accepted to memory and will be durable if the vaddr points to persistent memory. 
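Not part of the patch, but for completeness: the new X86_FEATURE_CLWB bit in word 9 corresponds to CPUID.(EAX=7,ECX=0):EBX bit 24, so the instruction can also be probed for directly. A small stand-alone user-space C sketch, assuming the CPU implements CPUID leaf 7 (the function name is illustrative):

    #include <stdbool.h>

    /* Probe CPUID.(EAX=7,ECX=0):EBX bit 24, the CLWB feature flag. */
    static bool cpu_has_clwb(void)
    {
            unsigned int eax = 7, ebx, ecx = 0, edx;

            asm volatile("cpuid"
                         : "+a" (eax), "=b" (ebx), "+c" (ecx), "=d" (edx));
            return ebx & (1u << 24);
    }

In kernel code the equivalent check is simply boot_cpu_has(X86_FEATURE_CLWB) against the define added below.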
Regarding the details of how the alternatives assembly is set up, we need one additional byte at the beginning of the CLFLUSH so that we can flip it into a CLFLUSHOPT by changing that byte into a 0x66 prefix. Two options are to either insert a 1 byte ASM_NOP1, or to add a 1 byte NOP_DS_PREFIX. Both have no functional effect with the plain CLFLUSH, but I've been told that executing a CLFLUSH + prefix should be faster than executing a CLFLUSH + NOP. We had to hard code the assembly for CLWB because, lacking the ability to assemble the CLWB instruction itself, the next closest thing is to have an xsaveopt instruction with a 0x66 prefix. Unfortunately XSAVEOPT itself is also relatively new, and isn't included by all the GCC versions that the kernel needs to support. Signed-off-by: Ross Zwisler Acked-by: Borislav Petkov Acked-by: H. Peter Anvin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1422377631-8986-3-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/include/asm/special_insns.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 0f7a5a1a8db2..854c04b3c9c2 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -233,6 +233,7 @@ #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ #define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 2ec1a5392542..aeb4666e0c0a 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -201,6 +201,20 @@ static inline void clflushopt(volatile void *__p) "+m" (*(volatile char __force *)__p)); } +static inline void clwb(volatile void *__p) +{ + volatile struct { char x[64]; } *p = __p; + + asm volatile(ALTERNATIVE_2( + ".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])", + ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */ + X86_FEATURE_CLFLUSHOPT, + ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */ + X86_FEATURE_CLWB) + : [p] "+m" (*p) + : [pax] "a" (p)); +} + static inline void pcommit_sfence(void) { alternative(ASM_NOP7, From ff8287f36381deff729aa4e7b02296a080519fd0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Apr 2015 12:41:44 -0700 Subject: [PATCH 121/148] x86/asm/entry/32: Improve a TOP_OF_KERNEL_STACK_PADDING comment At Denys' request, clean up the comment describing stack padding in the 32-bit sysenter path. No code changes. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/41fee7bb8490ae840fe7ef2699f9c2feb932e729.1428002830.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 4c8cc34e6d68..effa2793feba 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -395,10 +395,13 @@ sysenter_past_esp: /*CFI_REL_OFFSET cs, 0*/ /* * Push current_thread_info()->sysenter_return to the stack. - * A tiny bit of offset fixup is necessary - 4*4 means the 4 words - * pushed above; +8 corresponds to copy_thread's esp0 setting. + * A tiny bit of offset fixup is necessary: TI_sysenter_return + * is relative to thread_info, which is at the bottom of the + * kernel stack page. 4*4 means the 4 words pushed above; + * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; + * and THREAD_SIZE takes us to the bottom. */ - pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+TOP_OF_KERNEL_STACK_PADDING+4*4)(%esp) + pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) CFI_REL_OFFSET eip, 0 pushl_cfi %eax From cf9328cc9989e028fdc64d8c0a7b1b043dc96735 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Apr 2015 12:41:45 -0700 Subject: [PATCH 122/148] x86/asm/entry/32: Stop caching MSR_IA32_SYSENTER_ESP in tss.sp1 We write a stack pointer to MSR_IA32_SYSENTER_ESP exactly once, and we unnecessarily cache the value in tss.sp1. We never read the cached value. Remove all of the caching. It serves no purpose. Suggested-by: Denys Vlasenko Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/05a0163eb33ef5208363f0015496855da7cebadd.1428002830.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 22 +++++++++++----------- arch/x86/kernel/cpu/common.c | 9 +++++---- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 572099710ba2..d2203b5d9538 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -209,21 +209,21 @@ struct x86_hw_tss { unsigned short back_link, __blh; unsigned long sp0; unsigned short ss0, __ss0h; + unsigned long sp1; /* - * We don't use ring 1, so sp1 and ss1 are convenient scratch - * spaces in the same cacheline as sp0. We use them to cache - * some MSR values to avoid unnecessary wrmsr instructions. + * We don't use ring 1, so ss1 is a convenient scratch space in + * the same cacheline as sp0. We use ss1 to cache the value in + * MSR_IA32_SYSENTER_CS. When we context switch + * MSR_IA32_SYSENTER_CS, we first check if the new value being + * written matches ss1, and, if it's not, then we wrmsr the new + * value and update ss1. * - * We use SYSENTER_ESP to find sp0 and for the NMI emergency - * stack, but we need to context switch it because we do - * horrible things to the kernel stack in vm86 mode. - * - * We use SYSENTER_CS to disable sysenter in vm86 mode to avoid - * corrupting the stack if we went through the sysenter path - * from vm86 mode. + * The only reason we context switch MSR_IA32_SYSENTER_CS is + * that we set it to zero in vm86 tasks to avoid corrupting the + * stack if we were to go through the sysenter path from vm86 + * mode. 
*/ - unsigned long sp1; /* MSR_IA32_SYSENTER_ESP */ unsigned short ss1; /* MSR_IA32_SYSENTER_CS */ unsigned short __ss1h; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 71e4adcb15f1..a383d53bf0ed 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -976,15 +976,16 @@ void enable_sep_cpu(void) goto out; /* - * The struct::SS1 and tss_struct::SP1 fields are not used by the hardware, - * we cache the SYSENTER CS and ESP values there for easy access: + * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- + * see the big comment in struct x86_hw_tss's definition. */ tss->x86_tss.ss1 = __KERNEL_CS; wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); - tss->x86_tss.sp1 = (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack); - wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); + wrmsr(MSR_IA32_SYSENTER_ESP, + (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), + 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0); From 4214a16b02971c60960afd675d03544e109e0d75 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 2 Apr 2015 17:12:12 -0700 Subject: [PATCH 123/148] x86/asm/entry/64/compat: Use SYSRETL to return from compat mode SYSENTER SYSEXIT is scary on 64-bit kernels -- SYSEXIT must be invoked with usergs and IRQs on. That means that we rely on STI to correctly mask interrupts for one instruction. This is okay by itself, but the semantics with respect to NMIs are unclear. Avoid the whole issue by using SYSRETL instead. For background, Intel CPUs don't allow SYSCALL from compat mode, but they do allow SYSRETL back to compat mode. Go figure. To avoid doing too much at once, this doesn't revamp the calling convention. We still return with EBP, EDX, and ECX on the user stack. Oddly this seems to be 30 cycles or so faster. Avoiding POPFQ and STI will account for under half of that, I think, so my best guess is that Intel just optimizes SYSRET much better than SYSEXIT. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/57a0bf1b5230b2716a64ebe48e9bc1110f7ab433.1428019097.git.luto@kernel.org Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 53 +++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 8d01cce7b6b8..5d8f987a340d 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -186,28 +186,55 @@ sysenter_dispatch: testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz sysexit_audit sysexit_from_sys_call: + /* + * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an + * NMI between STI and SYSEXIT has poorly specified behavior, + * and and NMI followed by an IRQ with usergs is fatal. So + * we just pretend we're using SYSEXIT but we really use + * SYSRETL instead. + * + * This code path is still called 'sysexit' because it pairs + * with 'sysenter' and it uses the SYSENTER calling convention. 
+ */ andl $~TS_COMPAT,ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - /* clear IF, that popfq doesn't enable interrupts early */ - andl $~0x200,EFLAGS(%rsp) - movl RIP(%rsp),%edx /* User %eip */ - CFI_REGISTER rip,rdx + movl RIP(%rsp),%ecx /* User %eip */ + CFI_REGISTER rip,rcx RESTORE_RSI_RDI - /* pop everything except ss,rsp,rflags slots */ - REMOVE_PT_GPREGS_FROM_STACK 3*8 + xorl %edx,%edx /* avoid info leaks */ xorq %r8,%r8 xorq %r9,%r9 xorq %r10,%r10 - xorq %r11,%r11 - popfq_cfi + movl EFLAGS(%rsp),%r11d /* User eflags */ /*CFI_RESTORE rflags*/ - popq_cfi %rcx /* User %esp */ - CFI_REGISTER rsp,rcx TRACE_IRQS_ON + /* - * 32bit SYSEXIT restores eip from edx, esp from ecx. - * cs and ss are loaded from MSRs. + * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, + * since it avoids a dicey window with interrupts enabled. */ - ENABLE_INTERRUPTS_SYSEXIT32 + movl RSP(%rsp),%esp + + /* + * USERGS_SYSRET32 does: + * gsbase = user's gs base + * eip = ecx + * rflags = r11 + * cs = __USER32_CS + * ss = __USER_DS + * + * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: + * + * pop %ebp + * pop %edx + * pop %ecx + * + * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to + * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's + * address (already known to user code), and R12-R15 are + * callee-saved and therefore don't contain any interesting + * kernel data. + */ + USERGS_SYSRET32 CFI_RESTORE_STATE From 47091e3c5b072daca29a15d2a3caf40359b0d140 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 3 Apr 2015 10:28:34 +0200 Subject: [PATCH 124/148] x86/asm/entry: Drop now unused ENABLE_INTERRUPTS_SYSEXIT32 Commit: 4214a16b0297 ("x86/asm/entry/64/compat: Use SYSRETL to return from compat mode SYSENTER") removed the last user of ENABLE_INTERRUPTS_SYSEXIT32. Kill the macro now too. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Cc: virtualization@lists.linux-foundation.org Link: http://lkml.kernel.org/r/1428049714-829-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irqflags.h | 4 ---- arch/x86/include/asm/paravirt.h | 5 ----- 2 files changed, 9 deletions(-) diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 9a63eae04e4b..b77f5edb03b0 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -136,10 +136,6 @@ static inline notrace unsigned long arch_local_irq_save(void) #define USERGS_SYSRET32 \ swapgs; \ sysretl -#define ENABLE_INTERRUPTS_SYSEXIT32 \ - swapgs; \ - sti; \ - sysexit #else #define INTERRUPT_RETURN iret diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 965c47d254aa..5f6051d5d139 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -976,11 +976,6 @@ extern void default_banner(void); PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ CLBR_NONE, \ jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) - -#define ENABLE_INTERRUPTS_SYSEXIT32 \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ - CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) #endif /* CONFIG_X86_32 */ #endif /* __ASSEMBLY__ */ From 78cac48c0434c82e860fade3cd0420a7a4adbb08 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 1 Apr 2015 12:49:52 +0200 Subject: [PATCH 125/148] x86/mm/KASLR: Propagate KASLR status to kernel proper Commit: e2b32e678513 ("x86, kaslr: randomize module base load address") made module base address randomization unconditional and didn't regard disabled KKASLR due to CONFIG_HIBERNATION and command line option "nokaslr". For more info see (now reverted) commit: f47233c2d34f ("x86/mm/ASLR: Propagate base load address calculation") In order to propagate KASLR status to kernel proper, we need a single bit in boot_params.hdr.loadflags and we've chosen bit 1 thus leaving the top-down allocated bits for bits supposed to be used by the bootloader. Originally-From: Jiri Kosina Suggested-by: H. Peter Anvin Signed-off-by: Borislav Petkov Cc: Kees Cook Signed-off-by: Ingo Molnar --- Documentation/x86/boot.txt | 6 ++++++ arch/x86/boot/compressed/aslr.c | 5 ++++- arch/x86/boot/compressed/misc.c | 5 ++++- arch/x86/boot/compressed/misc.h | 6 ++++-- arch/x86/include/asm/setup.h | 5 +++++ arch/x86/include/uapi/asm/bootparam.h | 1 + arch/x86/kernel/module.c | 11 ++--------- arch/x86/kernel/setup.c | 13 +++++++++---- 8 files changed, 35 insertions(+), 17 deletions(-) diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index a75e3adaa39d..88b85899d309 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -406,6 +406,12 @@ Protocol: 2.00+ - If 0, the protected-mode code is loaded at 0x10000. - If 1, the protected-mode code is loaded at 0x100000. + Bit 1 (kernel internal): ALSR_FLAG + - Used internally by the compressed kernel to communicate + KASLR status to kernel proper. + If 1, KASLR enabled. + If 0, KASLR disabled. + Bit 5 (write): QUIET_FLAG - If 0, print early messages. - If 1, suppress early messages. 
diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index bb1376381985..d7b1f655b3ef 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -295,7 +295,8 @@ static unsigned long find_random_addr(unsigned long minimum, return slots_fetch_random(); } -unsigned char *choose_kernel_location(unsigned char *input, +unsigned char *choose_kernel_location(struct boot_params *boot_params, + unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size) @@ -315,6 +316,8 @@ unsigned char *choose_kernel_location(unsigned char *input, } #endif + boot_params->hdr.loadflags |= KASLR_FLAG; + /* Record the various known unsafe memory ranges. */ mem_avoid_init((unsigned long)input, input_size, (unsigned long)output, output_size); diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index a950864a64da..a107b935e22f 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -377,6 +377,9 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, real_mode = rmode; + /* Clear it for solely in-kernel use */ + real_mode->hdr.loadflags &= ~KASLR_FLAG; + sanitize_boot_params(real_mode); if (real_mode->screen_info.orig_video_mode == 7) { @@ -401,7 +404,7 @@ asmlinkage __visible void *decompress_kernel(void *rmode, memptr heap, * the entire decompressed kernel plus relocation table, or the * entire decompressed kernel plus .bss and .brk sections. */ - output = choose_kernel_location(input_data, input_len, output, + output = choose_kernel_location(real_mode, input_data, input_len, output, output_len > run_size ? output_len : run_size); diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 04477d68403f..89dd0d78013a 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -57,7 +57,8 @@ int cmdline_find_option_bool(const char *option); #if CONFIG_RANDOMIZE_BASE /* aslr.c */ -unsigned char *choose_kernel_location(unsigned char *input, +unsigned char *choose_kernel_location(struct boot_params *boot_params, + unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size); @@ -65,7 +66,8 @@ unsigned char *choose_kernel_location(unsigned char *input, bool has_cpuflag(int flag); #else static inline -unsigned char *choose_kernel_location(unsigned char *input, +unsigned char *choose_kernel_location(struct boot_params *boot_params, + unsigned char *input, unsigned long input_size, unsigned char *output, unsigned long output_size) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ff4e7b236e21..f69e06b283fb 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -66,6 +66,11 @@ static inline void x86_ce4100_early_setup(void) { } */ extern struct boot_params boot_params; +static inline bool kaslr_enabled(void) +{ + return !!(boot_params.hdr.loadflags & KASLR_FLAG); +} + /* * Do NOT EVER look at the BIOS memory size location. * It does not work on many machines. 
diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index 225b0988043a..ab456dc233b5 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -15,6 +15,7 @@ /* loadflags */ #define LOADED_HIGH (1<<0) +#define KASLR_FLAG (1<<1) #define QUIET_FLAG (1<<5) #define KEEP_SEGMENTS (1<<6) #define CAN_USE_HEAP (1<<7) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index d1ac80b72c72..005c03e93fc5 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -33,6 +33,7 @@ #include #include +#include #if 0 #define DEBUGP(fmt, ...) \ @@ -47,21 +48,13 @@ do { \ #ifdef CONFIG_RANDOMIZE_BASE static unsigned long module_load_offset; -static int randomize_modules = 1; /* Mutex protects the module_load_offset. */ static DEFINE_MUTEX(module_kaslr_mutex); -static int __init parse_nokaslr(char *p) -{ - randomize_modules = 0; - return 0; -} -early_param("nokaslr", parse_nokaslr); - static unsigned long int get_module_load_offset(void) { - if (randomize_modules) { + if (kaslr_enabled()) { mutex_lock(&module_kaslr_mutex); /* * Calculate the module_load_offset the first time this diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0a2421cca01f..014466b152b5 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -832,10 +832,15 @@ static void __init trim_low_memory_range(void) static int dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) { - pr_emerg("Kernel Offset: 0x%lx from 0x%lx " - "(relocation range: 0x%lx-0x%lx)\n", - (unsigned long)&_text - __START_KERNEL, __START_KERNEL, - __START_KERNEL_map, MODULES_VADDR-1); + if (kaslr_enabled()) { + pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n", + (unsigned long)&_text - __START_KERNEL, + __START_KERNEL, + __START_KERNEL_map, + MODULES_VADDR-1); + } else { + pr_emerg("Kernel Offset: disabled\n"); + } return 0; } From 7c74d5b7b7b63fc279ed446ebd78de20d81f2824 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 3 Apr 2015 11:42:10 +0200 Subject: [PATCH 126/148] x86/asm/entry/64: Fix MSR_IA32_SYSENTER_CS MSR value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit: d56fe4bf5f3c ("x86/asm/entry/64: Always set up SYSENTER MSRs") missed to add "ULL" to the 0 and wrmsrl_safe() complains: arch/x86/kernel/cpu/common.c: In function ‘syscall_init’: arch/x86/kernel/cpu/common.c:1226:2: warning: right shift count >= width of type wrmsrl_safe(MSR_IA32_SYSENTER_CS, 0); Fix it. Signed-off-by: Borislav Petkov Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1428054130-25847-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a383d53bf0ed..d56d30714d43 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1184,7 +1184,7 @@ void syscall_init(void) wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); #else wrmsrl(MSR_CSTAR, ignore_sysret); - wrmsrl_safe(MSR_IA32_SYSENTER_CS, 0); + wrmsrl_safe(MSR_IA32_SYSENTER_CS, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); #endif From 6b51311c976593fb7311322b1647a912cd456ec4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 3 Apr 2015 14:25:28 +0200 Subject: [PATCH 127/148] x86/asm/entry/64: Use a define for an invalid segment selector ... instead of a naked number, for better readability. Signed-off-by: Borislav Petkov Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1428054130-25847-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/segment.h | 2 ++ arch/x86/kernel/cpu/common.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index d394899e055c..5a9856eb12ba 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -39,6 +39,8 @@ /* ... GDT has it cleared */ #define SEGMENT_GDT 0x0 +#define GDT_ENTRY_INVALID_SEG 0 + #ifdef CONFIG_X86_32 /* * The layout of the per-CPU GDT under Linux: diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d56d30714d43..3f70538012e2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1184,7 +1184,7 @@ void syscall_init(void) wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); #else wrmsrl(MSR_CSTAR, ignore_sysret); - wrmsrl_safe(MSR_IA32_SYSENTER_CS, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); #endif From dbe4058a6a44af4ca5d146aebe01b0a1f9b7fd2a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 4 Apr 2015 15:34:43 +0200 Subject: [PATCH 128/148] x86/alternatives: Fix ALTERNATIVE_2 padding generation properly Quentin caught a corner case with the generation of instruction padding in the ALTERNATIVE_2 macro: if len(orig_insn) < len(alt1) < len(alt2), then not enough padding gets added and that is not good(tm) as we could overwrite the beginning of the next instruction. Luckily, at the time of this writing, we don't have ALTERNATIVE_2() invocations which have that problem and even if we did, a simple fix would be to prepend the instructions with enough prefixes so that that corner case doesn't happen. However, best it would be if we fixed it properly. See below for a simple, abstracted example of what we're doing. So what we ended up doing is, we compute the max(len(alt1), len(alt2)) - len(orig_insn) and feed that value to the .skip gas directive. The max() cannot have conditionals due to gas limitations, thus the fancy integer math. 
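A quick aside on the macro itself, not part of the patch: GAS relational operators evaluate to 0 or -1 rather than C's 0 or 1, so the assembler expression can use the comparison result (suitably massaged for gas's signed 32-bit arithmetic) directly as a bit mask, whereas a plain-C rendering of the same branchless max needs -(a < b). A self-contained C check of the identity, with a made-up macro name for illustration:

    #include <assert.h>

    /*
     * Branchless max(a, b): (a ^ b) holds the differing bits,
     * -(a < b) is an all-ones mask exactly when a < b, so the XOR
     * flips a into b in that case and leaves a unchanged otherwise.
     */
    #define MAX_NO_BRANCH(a, b)  ((a) ^ (((a) ^ (b)) & -((a) < (b))))

    int main(void)
    {
            assert(MAX_NO_BRANCH(1, 2) == 2);
            assert(MAX_NO_BRANCH(4, 1) == 4);
            assert(MAX_NO_BRANCH(3, 3) == 3);
            return 0;
    }

Feeding that maximum, minus the original instruction length, into .skip is what guarantees enough padding even in the len(orig) < len(alt1) < len(alt2) corner case described above.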
With this patch, all ALTERNATIVE_2 sites get padded correctly; generating obscure test cases pass too: #define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) #define gen_skip(orig, alt1, alt2, marker) \ .skip -((alt_max_short(alt1, alt2) - (orig)) > 0) * \ (alt_max_short(alt1, alt2) - (orig)),marker .pushsection .text, "ax" .globl main main: gen_skip(1, 2, 4, 0x09) gen_skip(4, 1, 2, 0x10) ... .popsection Thanks to Quentin for catching it and double-checking the fix! Reported-by: Quentin Casasnovas Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150404133443.GE21152@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative-asm.h | 14 ++++++++++++-- arch/x86/include/asm/alternative.h | 16 ++++++++++++---- arch/x86/kernel/alternative.c | 4 ++-- 3 files changed, 26 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 524bddce0b76..bdf02eeee765 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -45,12 +45,22 @@ .popsection .endm +#define old_len 141b-140b +#define new_len1 144f-143f +#define new_len2 145f-144f + +/* + * max without conditionals. Idea adapted from: + * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + */ +#define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) + .macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 140: \oldinstr 141: - .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 - .skip -(((145f-144f)-(144f-143f)-(141b-140b)) > 0) * ((145f-144f)-(144f-143f)-(141b-140b)),0x90 + .skip -((alt_max_short(new_len1, new_len2) - (old_len)) > 0) * \ + (alt_max_short(new_len1, new_len2) - (old_len)),0x90 142: .pushsection .altinstructions,"a" diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 5aef6a97d80e..ba32af062f61 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -95,14 +95,22 @@ static inline int alternatives_text_reserved(void *start, void *end) __OLDINSTR(oldinstr, num) \ alt_end_marker ":\n" +/* + * max without conditionals. Idea adapted from: + * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + * + * The additional "-" is needed because gas works with s32s. + */ +#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))" + /* * Pad the second replacement alternative with additional NOPs if it is * additionally longer than the first replacement alternative. 
*/ -#define OLDINSTR_2(oldinstr, num1, num2) \ - __OLDINSTR(oldinstr, num1) \ - ".skip -(((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)) > 0) * " \ - "((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)),0x90\n" \ +#define OLDINSTR_2(oldinstr, num1, num2) \ + "661:\n\t" oldinstr "\n662:\n" \ + ".skip -((" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")) > 0) * " \ + "(" alt_max_short(alt_rlen(num1), alt_rlen(num2)) " - (" alt_slen ")), 0x90\n" \ alt_end_marker ":\n" #define ALTINSTR_ENTRY(feature, num) \ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 5c993c94255e..7c4ad005d7a0 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -369,11 +369,11 @@ void __init_or_module apply_alternatives(struct alt_instr *start, continue; } - DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d)", + DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d", a->cpuid >> 5, a->cpuid & 0x1f, instr, a->instrlen, - replacement, a->replacementlen); + replacement, a->replacementlen, a->padlen); DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr); DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement); From 6a3713f001b3b53587e411ab0d3036ae9b0fb93b Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 4 Apr 2015 08:58:23 -0400 Subject: [PATCH 129/148] x86/signal: Remove pax argument from restore_sigcontext The 'pax' argument is unnecesary. Instead, store the RAX value directly in regs. This pattern goes all the way back to 2.1.106pre1, when restore_sigcontext() was changed to return an error code instead of EAX directly: https://git.kernel.org/cgit/linux/kernel/git/history/history.git/diff/arch/i386/kernel/signal.c?id=9a8f8b7ca3f319bd668298d447bdf32730e51174 In 2007 sigaltstack syscall support was added, where the return value of restore_sigcontext() was changed to carry the memory-copying failure code. But instead of putting 'ax' into regs->ax directly, it was carried in via a pointer and then returned, where the generic syscall return code copied it to regs->ax. So there was never any deeper reason for this suboptimal pattern, it was simply never noticed after being introduced. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1428152303-17154-1-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32_signal.c | 17 ++++++----------- arch/x86/include/asm/sighandling.h | 4 +--- arch/x86/kernel/signal.c | 22 ++++++++-------------- 3 files changed, 15 insertions(+), 28 deletions(-) diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 1f5e2b0e09ff..c81d35e6c7f1 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -161,8 +161,7 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) } static int ia32_restore_sigcontext(struct pt_regs *regs, - struct sigcontext_ia32 __user *sc, - unsigned int *pax) + struct sigcontext_ia32 __user *sc) { unsigned int tmpflags, err = 0; void __user *buf; @@ -184,7 +183,7 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, RELOAD_SEG(es); COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); - COPY(dx); COPY(cx); COPY(ip); + COPY(dx); COPY(cx); COPY(ip); COPY(ax); /* Don't touch extended registers */ COPY_SEG_CPL3(cs); @@ -197,8 +196,6 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, get_user_ex(tmp, &sc->fpstate); buf = compat_ptr(tmp); - - get_user_ex(*pax, &sc->ax); } get_user_catch(err); err |= restore_xstate_sig(buf, 1); @@ -213,7 +210,6 @@ asmlinkage long sys32_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct sigframe_ia32 __user *frame = (struct sigframe_ia32 __user *)(regs->sp-8); sigset_t set; - unsigned int ax; if (!access_ok(VERIFY_READ, frame, sizeof(*frame))) goto badframe; @@ -226,9 +222,9 @@ asmlinkage long sys32_sigreturn(void) set_current_blocked(&set); - if (ia32_restore_sigcontext(regs, &frame->sc, &ax)) + if (ia32_restore_sigcontext(regs, &frame->sc)) goto badframe; - return ax; + return regs->ax; badframe: signal_fault(regs, frame, "32bit sigreturn"); @@ -240,7 +236,6 @@ asmlinkage long sys32_rt_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_ia32 __user *frame; sigset_t set; - unsigned int ax; frame = (struct rt_sigframe_ia32 __user *)(regs->sp - 4); @@ -251,13 +246,13 @@ asmlinkage long sys32_rt_sigreturn(void) set_current_blocked(&set); - if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) + if (ia32_restore_sigcontext(regs, &frame->uc.uc_mcontext)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) goto badframe; - return ax; + return regs->ax; badframe: signal_fault(regs, frame, "32bit rt sigreturn"); diff --git a/arch/x86/include/asm/sighandling.h b/arch/x86/include/asm/sighandling.h index 7a958164088c..89db46752a8f 100644 --- a/arch/x86/include/asm/sighandling.h +++ b/arch/x86/include/asm/sighandling.h @@ -13,9 +13,7 @@ X86_EFLAGS_CF | X86_EFLAGS_RF) void signal_fault(struct pt_regs *regs, void __user *frame, char *where); - -int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, - unsigned long *pax); +int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc); int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, struct pt_regs *regs, unsigned long mask); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index eaa2c5e3f2cd..53cc4085c3d7 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -61,8 +61,7 @@ regs->seg = GET_SEG(seg) | 3; \ } while (0) -int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, - unsigned long *pax) +int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user 
*sc) { void __user *buf; unsigned int tmpflags; @@ -81,7 +80,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, #endif /* CONFIG_X86_32 */ COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); - COPY(dx); COPY(cx); COPY(ip); + COPY(dx); COPY(cx); COPY(ip); COPY(ax); #ifdef CONFIG_X86_64 COPY(r8); @@ -102,8 +101,6 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, regs->orig_ax = -1; /* disable syscall checks */ get_user_ex(buf, &sc->fpstate); - - get_user_ex(*pax, &sc->ax); } get_user_catch(err); err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); @@ -545,7 +542,6 @@ asmlinkage unsigned long sys_sigreturn(void) { struct pt_regs *regs = current_pt_regs(); struct sigframe __user *frame; - unsigned long ax; sigset_t set; frame = (struct sigframe __user *)(regs->sp - 8); @@ -559,9 +555,9 @@ asmlinkage unsigned long sys_sigreturn(void) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->sc, &ax)) + if (restore_sigcontext(regs, &frame->sc)) goto badframe; - return ax; + return regs->ax; badframe: signal_fault(regs, frame, "sigreturn"); @@ -574,7 +570,6 @@ asmlinkage long sys_rt_sigreturn(void) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; - unsigned long ax; sigset_t set; frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); @@ -585,13 +580,13 @@ asmlinkage long sys_rt_sigreturn(void) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) goto badframe; - return ax; + return regs->ax; badframe: signal_fault(regs, frame, "rt_sigreturn"); @@ -786,7 +781,6 @@ asmlinkage long sys32_x32_rt_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_x32 __user *frame; sigset_t set; - unsigned long ax; frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); @@ -797,13 +791,13 @@ asmlinkage long sys32_x32_rt_sigreturn(void) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) goto badframe; - return ax; + return regs->ax; badframe: signal_fault(regs, frame, "x32 rt_sigreturn"); From fc3e958a2b552fe6210e6de3bebb4348156a0b5f Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Sat, 4 Apr 2015 20:55:19 +0200 Subject: [PATCH 130/148] x86/asm/entry: Clear EXTRA_REGS for all executable formats On failure, sys_execve() does not clobber EXTRA_REGS, so we can just return to userpsace without saving/restoring them. On success, ELF_PLAT_INIT() in sys_execve() clears all these registers. On other executable formats: - binfmt_flat.c has similar FLAT_PLAT_INIT, but x86 (and everyone else except sh) doesn't define it. - binfmt_elf_fdpic.c has ELF_FDPIC_PLAT_INIT, but x86 (and most others) doesn't define it. - There are no such hooks in binfmt_aout.c et al. We inherit EXTRA_REGS from the prior executable. This inconsistency was not intended. This change removes SAVE/RESTORE_EXTRA_REGS in stub_execve, removes register clearing in ELF_PLAT_INIT(), and instead simply clears them on success in stub_execve. Run-tested. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1428173719-7637-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/calling.h | 9 ++++++ arch/x86/include/asm/elf.h | 7 +++-- arch/x86/kernel/entry_64.S | 50 +++++++++++++++------------------- 3 files changed, 35 insertions(+), 31 deletions(-) diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 4b5f7bf2b780..1c8b50edb2db 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -151,6 +151,15 @@ For 32-bit we have the following conventions - kernel is built with movq_cfi_restore 5*8+\offset, rbx .endm + .macro ZERO_EXTRA_REGS + xorl %r15d, %r15d + xorl %r14d, %r14d + xorl %r13d, %r13d + xorl %r12d, %r12d + xorl %ebp, %ebp + xorl %ebx, %ebx + .endm + .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 .if \rstor_r11 movq_cfi_restore 6*8, r11 diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index ca3347a9dab5..3563107b5060 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -171,10 +171,11 @@ do { \ static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) { - regs->ax = regs->bx = regs->cx = regs->dx = 0; - regs->si = regs->di = regs->bp = 0; + /* Commented-out registers are cleared in stub_execve */ + /*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0; + regs->si = regs->di /*= regs->bp*/ = 0; regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; - regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; + /*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/ t->fs = t->gs = 0; t->fsindex = t->gsindex = 0; t->ds = t->es = ds; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 386375d43d14..f4270ff73f2a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -419,25 +419,27 @@ END(stub_\func) ENTRY(stub_execve) CFI_STARTPROC - addq $8, %rsp - DEFAULT_FRAME 0 - SAVE_EXTRA_REGS - call sys_execve - movq %rax,RAX(%rsp) - RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call + DEFAULT_FRAME 0, 8 + call sys_execve +return_from_execve: + testl %eax, %eax + jz 1f + /* exec failed, can use fast SYSRET code path in this case */ + ret +1: + /* must use IRET code path (pt_regs->cs may have changed) */ + addq $8, %rsp + ZERO_EXTRA_REGS + movq %rax,RAX(%rsp) + jmp int_ret_from_sys_call CFI_ENDPROC END(stub_execve) ENTRY(stub_execveat) CFI_STARTPROC - addq $8, %rsp - DEFAULT_FRAME 0 - SAVE_EXTRA_REGS - call sys_execveat - movq %rax,RAX(%rsp) - RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call + DEFAULT_FRAME 0, 8 + call sys_execveat + jmp return_from_execve CFI_ENDPROC END(stub_execveat) @@ -472,25 +474,17 @@ END(stub_x32_rt_sigreturn) ENTRY(stub_x32_execve) CFI_STARTPROC - addq $8, %rsp - DEFAULT_FRAME 0 - SAVE_EXTRA_REGS - call compat_sys_execve - movq %rax,RAX(%rsp) - RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call + DEFAULT_FRAME 0, 8 + call compat_sys_execve + jmp return_from_execve CFI_ENDPROC END(stub_x32_execve) ENTRY(stub_x32_execveat) CFI_STARTPROC - addq $8, %rsp - DEFAULT_FRAME 0 - SAVE_EXTRA_REGS - call compat_sys_execveat - movq %rax,RAX(%rsp) - RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call + DEFAULT_FRAME 0, 8 + call compat_sys_execveat + jmp return_from_execve CFI_ENDPROC END(stub_x32_execveat) From 69df353ff305805fc16082d0c5bfa6e20fa8b863 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 4 Apr 2015 23:07:42 +0200 
Subject: [PATCH 131/148] x86/alternatives: Guard NOPs optimization Take a look at the first instruction byte before optimizing the NOP - there might be something else there already, like the ALTERNATIVE_2() in rdtsc_barrier() which NOPs out on AMD even though we just patched in an MFENCE. This happens because the alternatives sees X86_FEATURE_MFENCE_RDTSC, AMD CPUs set it, we patch in the MFENCE and right afterwards it sees X86_FEATURE_LFENCE_RDTSC which AMD CPUs don't set and we blindly optimize the NOP. Checking whether at least the first byte is 0x90 prevents that. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1428181662-18020-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 7c4ad005d7a0..aef653193160 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -325,6 +325,9 @@ done: static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) { + if (instr[0] != 0x90) + return; + add_nops(instr + (a->instrlen - a->padlen), a->padlen); DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", From 3f705dfdf85a6416f5f12e52b7610144a99cbedc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 6 Apr 2015 23:11:06 -0700 Subject: [PATCH 132/148] x86, selftests: Add sigreturn selftest This is my sigreturn test, added mostly unchanged from its old home. It exercises the sigreturn(2) syscall, specifically focusing on its interactions with various IRET corner cases. It tests for correct behavior in several areas that were historically dangerously buggy. For example, it exercises espfix on kernels of both bitnesses under various conditions, and it contains testcases for several now-fixed bugs in IRET error handling. If you run it on older kernels without the fixes, your system will crash. It probably won't eat your data in the process. There is no released kernel on which the sigreturn_64 test will pass, but it passes on tip:x86/asm. I plan to switch to lib.mk for Linux 4.2. I'm not using the ksft_ helpers at all yet. I can do that later. Signed-off-by: Andy Lutomirski Acked-by: Shuah Khan Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Michael Ellerman Cc: Shuah Khan Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/89d10b76b92c7202d8123654dc8d36701c017b3d.1428386971.git.luto@kernel.org [ Fixed empty format string GCC build warning in trivial_32bit_program.c ] Signed-off-by: Ingo Molnar --- tools/testing/selftests/Makefile | 1 + tools/testing/selftests/x86/.gitignore | 2 + tools/testing/selftests/x86/Makefile | 48 ++ tools/testing/selftests/x86/run_x86_tests.sh | 11 + tools/testing/selftests/x86/sigreturn.c | 684 ++++++++++++++++++ .../selftests/x86/trivial_32bit_program.c | 14 + 6 files changed, 760 insertions(+) create mode 100644 tools/testing/selftests/x86/.gitignore create mode 100644 tools/testing/selftests/x86/Makefile create mode 100644 tools/testing/selftests/x86/run_x86_tests.sh create mode 100644 tools/testing/selftests/x86/sigreturn.c create mode 100644 tools/testing/selftests/x86/trivial_32bit_program.c diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile index 4e511221a0c1..2ad56d451469 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile @@ -17,6 +17,7 @@ TARGETS += sysctl TARGETS += timers TARGETS += user TARGETS += vm +TARGETS += x86 #Please keep the TARGETS list alphabetically sorted TARGETS_HOTPLUG = cpu-hotplug diff --git a/tools/testing/selftests/x86/.gitignore b/tools/testing/selftests/x86/.gitignore new file mode 100644 index 000000000000..15034fef9698 --- /dev/null +++ b/tools/testing/selftests/x86/.gitignore @@ -0,0 +1,2 @@ +*_32 +*_64 diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile new file mode 100644 index 000000000000..f0a7918178dd --- /dev/null +++ b/tools/testing/selftests/x86/Makefile @@ -0,0 +1,48 @@ +.PHONY: all all_32 all_64 check_build32 clean run_tests + +TARGETS_C_BOTHBITS := sigreturn + +BINARIES_32 := $(TARGETS_C_BOTHBITS:%=%_32) +BINARIES_64 := $(TARGETS_C_BOTHBITS:%=%_64) + +CFLAGS := -O2 -g -std=gnu99 -pthread -Wall + +UNAME_P := $(shell uname -p) + +# Always build 32-bit tests +all: all_32 + +# If we're on a 64-bit host, build 64-bit tests as well +ifeq ($(shell uname -p),x86_64) +all: all_64 +endif + +all_32: check_build32 $(BINARIES_32) + +all_64: $(BINARIES_64) + +clean: + $(RM) $(BINARIES_32) $(BINARIES_64) + +run_tests: + ./run_x86_tests.sh + +$(TARGETS_C_BOTHBITS:%=%_32): %_32: %.c + $(CC) -m32 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl + +$(TARGETS_C_BOTHBITS:%=%_64): %_64: %.c + $(CC) -m64 -o $@ $(CFLAGS) $(EXTRA_CFLAGS) $^ -lrt -ldl + +check_build32: + @if ! $(CC) -m32 -o /dev/null trivial_32bit_program.c; then \ + echo "Warning: you seem to have a broken 32-bit build" 2>&1; \ + echo "environment. If you are using a Debian-like"; \ + echo " distribution, try:"; \ + echo ""; \ + echo " apt-get install gcc-multilib libc6-i386 libc6-dev-i386"; \ + echo ""; \ + echo "If you are using a Fedora-like distribution, try:"; \ + echo ""; \ + echo " yum install glibc-devel.*i686"; \ + exit 1; \ + fi diff --git a/tools/testing/selftests/x86/run_x86_tests.sh b/tools/testing/selftests/x86/run_x86_tests.sh new file mode 100644 index 000000000000..3d3ec65f3e7c --- /dev/null +++ b/tools/testing/selftests/x86/run_x86_tests.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +# This is deliberately minimal. IMO kselftests should provide a standard +# script here. 
+./sigreturn_32 || exit 1 + +if [[ "$uname -p" -eq "x86_64" ]]; then + ./sigreturn_64 || exit 1 +fi + +exit 0 diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c new file mode 100644 index 000000000000..b5aa1bab7416 --- /dev/null +++ b/tools/testing/selftests/x86/sigreturn.c @@ -0,0 +1,684 @@ +/* + * sigreturn.c - tests for x86 sigreturn(2) and exit-to-userspace + * Copyright (c) 2014-2015 Andrew Lutomirski + * + * This program is free software; you can redistribute it and/or modify + * it under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * This is a series of tests that exercises the sigreturn(2) syscall and + * the IRET / SYSRET paths in the kernel. + * + * For now, this focuses on the effects of unusual CS and SS values, + * and it has a bunch of tests to make sure that ESP/RSP is restored + * properly. + * + * The basic idea behind these tests is to raise(SIGUSR1) to create a + * sigcontext frame, plug in the values to be tested, and then return, + * which implicitly invokes sigreturn(2) and programs the user context + * as desired. + * + * For tests for which we expect sigreturn and the subsequent return to + * user mode to succeed, we return to a short trampoline that generates + * SIGTRAP so that the meat of the tests can be ordinary C code in a + * SIGTRAP handler. + * + * The inner workings of each test is documented below. + * + * Do not run on outdated, unpatched kernels at risk of nasty crashes. + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * In principle, this test can run on Linux emulation layers (e.g. + * Illumos "LX branded zones"). Solaris-based kernels reserve LDT + * entries 0-5 for their own internal purposes, so start our LDT + * allocations above that reservation. (The tests don't pass on LX + * branded zones, but at least this lets them run.) + */ +#define LDT_OFFSET 6 + +/* An aligned stack accessible through some of our segments. */ +static unsigned char stack16[65536] __attribute__((aligned(4096))); + +/* + * An aligned int3 instruction used as a trampoline. Some of the tests + * want to fish out their ss values, so this trampoline copies ss to eax + * before the int3. + */ +asm (".pushsection .text\n\t" + ".type int3, @function\n\t" + ".align 4096\n\t" + "int3:\n\t" + "mov %ss,%eax\n\t" + "int3\n\t" + ".size int3, . - int3\n\t" + ".align 4096, 0xcc\n\t" + ".popsection"); +extern char int3[4096]; + +/* + * At startup, we prepapre: + * + * - ldt_nonexistent_sel: An LDT entry that doesn't exist (all-zero + * descriptor or out of bounds). + * - code16_sel: A 16-bit LDT code segment pointing to int3. + * - data16_sel: A 16-bit LDT data segment pointing to stack16. + * - npcode32_sel: A 32-bit not-present LDT code segment pointing to int3. + * - npdata32_sel: A 32-bit not-present LDT data segment pointing to stack16. + * - gdt_data16_idx: A 16-bit GDT data segment pointing to stack16. + * - gdt_npdata32_idx: A 32-bit not-present GDT data segment pointing to + * stack16. 
+ * + * For no particularly good reason, xyz_sel is a selector value with the + * RPL and LDT bits filled in, whereas xyz_idx is just an index into the + * descriptor table. These variables will be zero if their respective + * segments could not be allocated. + */ +static unsigned short ldt_nonexistent_sel; +static unsigned short code16_sel, data16_sel, npcode32_sel, npdata32_sel; + +static unsigned short gdt_data16_idx, gdt_npdata32_idx; + +static unsigned short GDT3(int idx) +{ + return (idx << 3) | 3; +} + +static unsigned short LDT3(int idx) +{ + return (idx << 3) | 7; +} + +/* Our sigaltstack scratch space. */ +static char altstack_data[SIGSTKSZ]; + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static void clearhandler(int sig) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static void add_ldt(const struct user_desc *desc, unsigned short *var, + const char *name) +{ + if (syscall(SYS_modify_ldt, 1, desc, sizeof(*desc)) == 0) { + *var = LDT3(desc->entry_number); + } else { + printf("[NOTE]\tFailed to create %s segment\n", name); + *var = 0; + } +} + +static void setup_ldt(void) +{ + if ((unsigned long)stack16 > (1ULL << 32) - sizeof(stack16)) + errx(1, "stack16 is too high\n"); + if ((unsigned long)int3 > (1ULL << 32) - sizeof(int3)) + errx(1, "int3 is too high\n"); + + ldt_nonexistent_sel = LDT3(LDT_OFFSET + 2); + + const struct user_desc code16_desc = { + .entry_number = LDT_OFFSET + 0, + .base_addr = (unsigned long)int3, + .limit = 4095, + .seg_32bit = 0, + .contents = 2, /* Code, not conforming */ + .read_exec_only = 0, + .limit_in_pages = 0, + .seg_not_present = 0, + .useable = 0 + }; + add_ldt(&code16_desc, &code16_sel, "code16"); + + const struct user_desc data16_desc = { + .entry_number = LDT_OFFSET + 1, + .base_addr = (unsigned long)stack16, + .limit = 0xffff, + .seg_32bit = 0, + .contents = 0, /* Data, grow-up */ + .read_exec_only = 0, + .limit_in_pages = 0, + .seg_not_present = 0, + .useable = 0 + }; + add_ldt(&data16_desc, &data16_sel, "data16"); + + const struct user_desc npcode32_desc = { + .entry_number = LDT_OFFSET + 3, + .base_addr = (unsigned long)int3, + .limit = 4095, + .seg_32bit = 1, + .contents = 2, /* Code, not conforming */ + .read_exec_only = 0, + .limit_in_pages = 0, + .seg_not_present = 1, + .useable = 0 + }; + add_ldt(&npcode32_desc, &npcode32_sel, "npcode32"); + + const struct user_desc npdata32_desc = { + .entry_number = LDT_OFFSET + 4, + .base_addr = (unsigned long)stack16, + .limit = 0xffff, + .seg_32bit = 1, + .contents = 0, /* Data, grow-up */ + .read_exec_only = 0, + .limit_in_pages = 0, + .seg_not_present = 1, + .useable = 0 + }; + add_ldt(&npdata32_desc, &npdata32_sel, "npdata32"); + + struct user_desc gdt_data16_desc = { + .entry_number = -1, + .base_addr = (unsigned long)stack16, + .limit = 0xffff, + .seg_32bit = 0, + .contents = 0, /* Data, grow-up */ + .read_exec_only = 0, + .limit_in_pages = 0, + .seg_not_present = 0, + .useable = 0 + }; + + if (syscall(SYS_set_thread_area, &gdt_data16_desc) == 0) { + /* + * This probably indicates vulnerability to CVE-2014-8133. + * Merely getting here isn't definitive, though, and we'll + * diagnose the problem for real later on. 
+ */ + printf("[WARN]\tset_thread_area allocated data16 at index %d\n", + gdt_data16_desc.entry_number); + gdt_data16_idx = gdt_data16_desc.entry_number; + } else { + printf("[OK]\tset_thread_area refused 16-bit data\n"); + } + + struct user_desc gdt_npdata32_desc = { + .entry_number = -1, + .base_addr = (unsigned long)stack16, + .limit = 0xffff, + .seg_32bit = 1, + .contents = 0, /* Data, grow-up */ + .read_exec_only = 0, + .limit_in_pages = 0, + .seg_not_present = 1, + .useable = 0 + }; + + if (syscall(SYS_set_thread_area, &gdt_npdata32_desc) == 0) { + /* + * As a hardening measure, newer kernels don't allow this. + */ + printf("[WARN]\tset_thread_area allocated npdata32 at index %d\n", + gdt_npdata32_desc.entry_number); + gdt_npdata32_idx = gdt_npdata32_desc.entry_number; + } else { + printf("[OK]\tset_thread_area refused 16-bit data\n"); + } +} + +/* State used by our signal handlers. */ +static gregset_t initial_regs, requested_regs, resulting_regs; + +/* Instructions for the SIGUSR1 handler. */ +static volatile unsigned short sig_cs, sig_ss; +static volatile sig_atomic_t sig_trapped, sig_err, sig_trapno; + +/* Abstractions for some 32-bit vs 64-bit differences. */ +#ifdef __x86_64__ +# define REG_IP REG_RIP +# define REG_SP REG_RSP +# define REG_AX REG_RAX + +struct selectors { + unsigned short cs, gs, fs, ss; +}; + +static unsigned short *ssptr(ucontext_t *ctx) +{ + struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS]; + return &sels->ss; +} + +static unsigned short *csptr(ucontext_t *ctx) +{ + struct selectors *sels = (void *)&ctx->uc_mcontext.gregs[REG_CSGSFS]; + return &sels->cs; +} +#else +# define REG_IP REG_EIP +# define REG_SP REG_ESP +# define REG_AX REG_EAX + +static greg_t *ssptr(ucontext_t *ctx) +{ + return &ctx->uc_mcontext.gregs[REG_SS]; +} + +static greg_t *csptr(ucontext_t *ctx) +{ + return &ctx->uc_mcontext.gregs[REG_CS]; +} +#endif + +/* Number of errors in the current test case. */ +static volatile sig_atomic_t nerrs; + +/* + * SIGUSR1 handler. Sets CS and SS as requested and points IP to the + * int3 trampoline. Sets SP to a large known value so that we can see + * whether the value round-trips back to user mode correctly. + */ +static void sigusr1(int sig, siginfo_t *info, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t*)ctx_void; + + memcpy(&initial_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); + + *csptr(ctx) = sig_cs; + *ssptr(ctx) = sig_ss; + + ctx->uc_mcontext.gregs[REG_IP] = + sig_cs == code16_sel ? 0 : (unsigned long)&int3; + ctx->uc_mcontext.gregs[REG_SP] = (unsigned long)0x8badf00d5aadc0deULL; + ctx->uc_mcontext.gregs[REG_AX] = 0; + + memcpy(&requested_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); + requested_regs[REG_AX] = *ssptr(ctx); /* The asm code does this. */ + + return; +} + +/* + * Called after a successful sigreturn. Restores our state so that + * the original raise(SIGUSR1) returns. + */ +static void sigtrap(int sig, siginfo_t *info, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t*)ctx_void; + + sig_err = ctx->uc_mcontext.gregs[REG_ERR]; + sig_trapno = ctx->uc_mcontext.gregs[REG_TRAPNO]; + + unsigned short ss; + asm ("mov %%ss,%0" : "=r" (ss)); + + greg_t asm_ss = ctx->uc_mcontext.gregs[REG_AX]; + if (asm_ss != sig_ss && sig == SIGTRAP) { + /* Sanity check failure. 
*/ + printf("[FAIL]\tSIGTRAP: ss = %hx, frame ss = %hx, ax = %llx\n", + ss, *ssptr(ctx), (unsigned long long)asm_ss); + nerrs++; + } + + memcpy(&resulting_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); + memcpy(&ctx->uc_mcontext.gregs, &initial_regs, sizeof(gregset_t)); + + sig_trapped = sig; +} + +/* + * Checks a given selector for its code bitness or returns -1 if it's not + * a usable code segment selector. + */ +int cs_bitness(unsigned short cs) +{ + uint32_t valid = 0, ar; + asm ("lar %[cs], %[ar]\n\t" + "jnz 1f\n\t" + "mov $1, %[valid]\n\t" + "1:" + : [ar] "=r" (ar), [valid] "+rm" (valid) + : [cs] "r" (cs)); + + if (!valid) + return -1; + + bool db = (ar & (1 << 22)); + bool l = (ar & (1 << 21)); + + if (!(ar & (1<<11))) + return -1; /* Not code. */ + + if (l && !db) + return 64; + else if (!l && db) + return 32; + else if (!l && !db) + return 16; + else + return -1; /* Unknown bitness. */ +} + +/* Finds a usable code segment of the requested bitness. */ +int find_cs(int bitness) +{ + unsigned short my_cs; + + asm ("mov %%cs,%0" : "=r" (my_cs)); + + if (cs_bitness(my_cs) == bitness) + return my_cs; + if (cs_bitness(my_cs + (2 << 3)) == bitness) + return my_cs + (2 << 3); + if (my_cs > (2<<3) && cs_bitness(my_cs - (2 << 3)) == bitness) + return my_cs - (2 << 3); + if (cs_bitness(code16_sel) == bitness) + return code16_sel; + + printf("[WARN]\tCould not find %d-bit CS\n", bitness); + return -1; +} + +static int test_valid_sigreturn(int cs_bits, bool use_16bit_ss, int force_ss) +{ + int cs = find_cs(cs_bits); + if (cs == -1) { + printf("[SKIP]\tCode segment unavailable for %d-bit CS, %d-bit SS\n", + cs_bits, use_16bit_ss ? 16 : 32); + return 0; + } + + if (force_ss != -1) { + sig_ss = force_ss; + } else { + if (use_16bit_ss) { + if (!data16_sel) { + printf("[SKIP]\tData segment unavailable for %d-bit CS, 16-bit SS\n", + cs_bits); + return 0; + } + sig_ss = data16_sel; + } else { + asm volatile ("mov %%ss,%0" : "=r" (sig_ss)); + } + } + + sig_cs = cs; + + printf("[RUN]\tValid sigreturn: %d-bit CS (%hx), %d-bit SS (%hx%s)\n", + cs_bits, sig_cs, use_16bit_ss ? 16 : 32, sig_ss, + (sig_ss & 4) ? "" : ", GDT"); + + raise(SIGUSR1); + + nerrs = 0; + + /* + * Check that each register had an acceptable value when the + * int3 trampoline was invoked. + */ + for (int i = 0; i < NGREG; i++) { + greg_t req = requested_regs[i], res = resulting_regs[i]; + if (i == REG_TRAPNO || i == REG_IP) + continue; /* don't care */ + if (i == REG_SP) { + printf("\tSP: %llx -> %llx\n", (unsigned long long)req, + (unsigned long long)res); + + /* + * In many circumstances, the high 32 bits of rsp + * are zeroed. For example, we could be a real + * 32-bit program, or we could hit any of a number + * of poorly-documented IRET or segmented ESP + * oddities. If this happens, it's okay. 
+ */ + if (res == (req & 0xFFFFFFFF)) + continue; /* OK; not expected to work */ + } + + bool ignore_reg = false; +#if __i386__ + if (i == REG_UESP) + ignore_reg = true; +#else + if (i == REG_CSGSFS) { + struct selectors *req_sels = + (void *)&requested_regs[REG_CSGSFS]; + struct selectors *res_sels = + (void *)&resulting_regs[REG_CSGSFS]; + if (req_sels->cs != res_sels->cs) { + printf("[FAIL]\tCS mismatch: requested 0x%hx; got 0x%hx\n", + req_sels->cs, res_sels->cs); + nerrs++; + } + + if (req_sels->ss != res_sels->ss) { + printf("[FAIL]\tSS mismatch: requested 0x%hx; got 0x%hx\n", + req_sels->ss, res_sels->ss); + nerrs++; + } + + continue; + } +#endif + + /* Sanity check on the kernel */ + if (i == REG_AX && requested_regs[i] != resulting_regs[i]) { + printf("[FAIL]\tAX (saved SP) mismatch: requested 0x%llx; got 0x%llx\n", + (unsigned long long)requested_regs[i], + (unsigned long long)resulting_regs[i]); + nerrs++; + continue; + } + + if (requested_regs[i] != resulting_regs[i] && !ignore_reg) { + /* + * SP is particularly interesting here. The + * usual cause of failures is that we hit the + * nasty IRET case of returning to a 16-bit SS, + * in which case bits 16:31 of the *kernel* + * stack pointer persist in ESP. + */ + printf("[FAIL]\tReg %d mismatch: requested 0x%llx; got 0x%llx\n", + i, (unsigned long long)requested_regs[i], + (unsigned long long)resulting_regs[i]); + nerrs++; + } + } + + if (nerrs == 0) + printf("[OK]\tall registers okay\n"); + + return nerrs; +} + +static int test_bad_iret(int cs_bits, unsigned short ss, int force_cs) +{ + int cs = force_cs == -1 ? find_cs(cs_bits) : force_cs; + if (cs == -1) + return 0; + + sig_cs = cs; + sig_ss = ss; + + printf("[RUN]\t%d-bit CS (%hx), bogus SS (%hx)\n", + cs_bits, sig_cs, sig_ss); + + sig_trapped = 0; + raise(SIGUSR1); + if (sig_trapped) { + char errdesc[32] = ""; + if (sig_err) { + const char *src = (sig_err & 1) ? " EXT" : ""; + const char *table; + if ((sig_err & 0x6) == 0x0) + table = "GDT"; + else if ((sig_err & 0x6) == 0x4) + table = "LDT"; + else if ((sig_err & 0x6) == 0x2) + table = "IDT"; + else + table = "???"; + + sprintf(errdesc, "%s%s index %d, ", + table, src, sig_err >> 3); + } + + char trapname[32]; + if (sig_trapno == 13) + strcpy(trapname, "GP"); + else if (sig_trapno == 11) + strcpy(trapname, "NP"); + else if (sig_trapno == 12) + strcpy(trapname, "SS"); + else if (sig_trapno == 32) + strcpy(trapname, "IRET"); /* X86_TRAP_IRET */ + else + sprintf(trapname, "%d", sig_trapno); + + printf("[OK]\tGot #%s(0x%lx) (i.e. %s%s)\n", + trapname, (unsigned long)sig_err, + errdesc, strsignal(sig_trapped)); + return 0; + } else { + printf("[FAIL]\tDid not get SIGSEGV\n"); + return 1; + } +} + +int main() +{ + int total_nerrs = 0; + unsigned short my_cs, my_ss; + + asm volatile ("mov %%cs,%0" : "=r" (my_cs)); + asm volatile ("mov %%ss,%0" : "=r" (my_ss)); + setup_ldt(); + + stack_t stack = { + .ss_sp = altstack_data, + .ss_size = SIGSTKSZ, + }; + if (sigaltstack(&stack, NULL) != 0) + err(1, "sigaltstack"); + + sethandler(SIGUSR1, sigusr1, 0); + sethandler(SIGTRAP, sigtrap, SA_ONSTACK); + + /* Easy cases: return to a 32-bit SS in each possible CS bitness. */ + total_nerrs += test_valid_sigreturn(64, false, -1); + total_nerrs += test_valid_sigreturn(32, false, -1); + total_nerrs += test_valid_sigreturn(16, false, -1); + + /* + * Test easy espfix cases: return to a 16-bit LDT SS in each possible + * CS bitness. NB: with a long mode CS, the SS bitness is irrelevant. 
+ * + * This catches the original missing-espfix-on-64-bit-kernels issue + * as well as CVE-2014-8134. + */ + total_nerrs += test_valid_sigreturn(64, true, -1); + total_nerrs += test_valid_sigreturn(32, true, -1); + total_nerrs += test_valid_sigreturn(16, true, -1); + + if (gdt_data16_idx) { + /* + * For performance reasons, Linux skips espfix if SS points + * to the GDT. If we were able to allocate a 16-bit SS in + * the GDT, see if it leaks parts of the kernel stack pointer. + * + * This tests for CVE-2014-8133. + */ + total_nerrs += test_valid_sigreturn(64, true, + GDT3(gdt_data16_idx)); + total_nerrs += test_valid_sigreturn(32, true, + GDT3(gdt_data16_idx)); + total_nerrs += test_valid_sigreturn(16, true, + GDT3(gdt_data16_idx)); + } + + /* + * We're done testing valid sigreturn cases. Now we test states + * for which sigreturn itself will succeed but the subsequent + * entry to user mode will fail. + * + * Depending on the failure mode and the kernel bitness, these + * entry failures can generate SIGSEGV, SIGBUS, or SIGILL. + */ + clearhandler(SIGTRAP); + sethandler(SIGSEGV, sigtrap, SA_ONSTACK); + sethandler(SIGBUS, sigtrap, SA_ONSTACK); + sethandler(SIGILL, sigtrap, SA_ONSTACK); /* 32-bit kernels do this */ + + /* Easy failures: invalid SS, resulting in #GP(0) */ + test_bad_iret(64, ldt_nonexistent_sel, -1); + test_bad_iret(32, ldt_nonexistent_sel, -1); + test_bad_iret(16, ldt_nonexistent_sel, -1); + + /* These fail because SS isn't a data segment, resulting in #GP(SS) */ + test_bad_iret(64, my_cs, -1); + test_bad_iret(32, my_cs, -1); + test_bad_iret(16, my_cs, -1); + + /* Try to return to a not-present code segment, triggering #NP(SS). */ + test_bad_iret(32, my_ss, npcode32_sel); + + /* + * Try to return to a not-present but otherwise valid data segment. + * This will cause IRET to fail with #SS on the espfix stack. This + * exercises CVE-2014-9322. + * + * Note that, if espfix is enabled, 64-bit Linux will lose track + * of the actual cause of failure and report #GP(0) instead. + * This would be very difficult for Linux to avoid, because + * espfix64 causes IRET failures to be promoted to #DF, so the + * original exception frame is never pushed onto the stack. + */ + test_bad_iret(32, npdata32_sel, -1); + + /* + * Try to return to a not-present but otherwise valid data + * segment without invoking espfix. Newer kernels don't allow + * this to happen in the first place. On older kernels, though, + * this can trigger CVE-2014-9322. + */ + if (gdt_npdata32_idx) + test_bad_iret(32, GDT3(gdt_npdata32_idx), -1); + + return total_nerrs ? 1 : 0; +} diff --git a/tools/testing/selftests/x86/trivial_32bit_program.c b/tools/testing/selftests/x86/trivial_32bit_program.c new file mode 100644 index 000000000000..2e231beb0a39 --- /dev/null +++ b/tools/testing/selftests/x86/trivial_32bit_program.c @@ -0,0 +1,14 @@ +/* + * Trivial program to check that we have a valid 32-bit build environment. + * Copyright (c) 2015 Andy Lutomirski + * GPL v2 + */ + +#include + +int main() +{ + printf("\n"); + + return 0; +} From fffbb5dcfd29f8831e41b4dd2ab938bd36d35283 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 2 Apr 2015 18:46:59 +0200 Subject: [PATCH 133/148] x86/asm/entry/64: Move opportunistic sysret code to syscall code path This change does two things: Copy-pastes "retint_swapgs:" code into syscall handling code, the copy is under "syscall_return:" label. The code is unchanged apart from some label renames. 
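For reference, the eligibility test that this copied block performs before taking the SYSRET path can be sketched in C roughly as follows (illustrative only, not actual kernel code; field names follow struct pt_regs):

	static bool sysret_ok(const struct pt_regs *regs)
	{
		/* SYSRET reloads RIP from RCX and RFLAGS from R11 */
		if (regs->cx != regs->ip || regs->r11 != regs->flags)
			return false;
		/* RIP must not have any of the 17 high bits set */
		if (regs->ip >> __VIRTUAL_MASK_SHIFT)
			return false;
		/* RF or TF in R11 would trap right after SYSRET */
		if (regs->r11 & (X86_EFLAGS_RF | X86_EFLAGS_TF))
			return false;
		/* only the flat 64-bit user segments qualify */
		return regs->cs == __USER_CS && regs->ss == __USER_DS;
	}
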
Removes "opportunistic sysret" code from "retint_swapgs:" code block, since now it won't be reached by syscall return. This in fact removes most of the code in question. text data bss dec hex filename 12530 0 0 12530 30f2 entry_64.o.before 12562 0 0 12562 3112 entry_64.o Run-tested. Acked-and-Tested-by: Borislav Petkov Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1427993219-7291-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 158 ++++++++++++++++++++----------------- 1 file changed, 86 insertions(+), 72 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 65485b3baa59..e4c810395bae 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -354,8 +354,8 @@ GLOBAL(int_with_check) movl TI_flags(%rcx),%edx andl %edi,%edx jnz int_careful - andl $~TS_COMPAT,TI_status(%rcx) - jmp retint_swapgs + andl $~TS_COMPAT,TI_status(%rcx) + jmp syscall_return /* Either reschedule or signal or syscall exit tracking needed. */ /* First do a reschedule test. */ @@ -399,9 +399,86 @@ int_restore_rest: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check + +syscall_return: + /* The IRETQ could re-enable interrupts: */ + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_IRETQ + + /* + * Try to use SYSRET instead of IRET if we're returning to + * a completely clean 64-bit userspace context. + */ + movq RCX(%rsp),%rcx + cmpq %rcx,RIP(%rsp) /* RCX == RIP */ + jne opportunistic_sysret_failed + + /* + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP + * in kernel space. This essentially lets the user take over + * the kernel, since userspace controls RSP. It's not worth + * testing for canonicalness exactly -- this check detects any + * of the 17 high bits set, which is true for non-canonical + * or kernel addresses. (This will pessimize vsyscall=native. + * Big deal.) + * + * If virtual addresses ever become wider, this will need + * to be updated to remain correct on both old and new CPUs. + */ + .ifne __VIRTUAL_MASK_SHIFT - 47 + .error "virtual address width changed -- SYSRET checks need update" + .endif + shr $__VIRTUAL_MASK_SHIFT, %rcx + jnz opportunistic_sysret_failed + + cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ + jne opportunistic_sysret_failed + + movq R11(%rsp),%r11 + cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ + jne opportunistic_sysret_failed + + /* + * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, + * restoring TF results in a trap from userspace immediately after + * SYSRET. This would cause an infinite loop whenever #DB happens + * with register state that satisfies the opportunistic SYSRET + * conditions. For example, single-stepping this user code: + * + * movq $stuck_here,%rcx + * pushfq + * popq %r11 + * stuck_here: + * + * would never get past 'stuck_here'. + */ + testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 + jnz opportunistic_sysret_failed + + /* nothing to check for RSP */ + + cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ + jne opportunistic_sysret_failed + + /* + * We win! This label is here just for ease of understanding + * perf profiles. Nothing jumps here. 
+ */ +syscall_return_via_sysret: + CFI_REMEMBER_STATE + /* r11 is already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_R11 + movq RSP(%rsp),%rsp + USERGS_SYSRET64 + CFI_RESTORE_STATE + +opportunistic_sysret_failed: + SWAPGS + jmp restore_c_regs_and_iret CFI_ENDPROC END(system_call) + .macro FORK_LIKE func ENTRY(stub_\func) CFI_STARTPROC @@ -673,76 +750,8 @@ retint_swapgs: /* return to user-space */ DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_IRETQ - /* - * Try to use SYSRET instead of IRET if we're returning to - * a completely clean 64-bit userspace context. - */ - movq RCX(%rsp),%rcx - cmpq %rcx,RIP(%rsp) /* RCX == RIP */ - jne opportunistic_sysret_failed - - /* - * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP - * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. It's not worth - * testing for canonicalness exactly -- this check detects any - * of the 17 high bits set, which is true for non-canonical - * or kernel addresses. (This will pessimize vsyscall=native. - * Big deal.) - * - * If virtual addresses ever become wider, this will need - * to be updated to remain correct on both old and new CPUs. - */ - .ifne __VIRTUAL_MASK_SHIFT - 47 - .error "virtual address width changed -- sysret checks need update" - .endif - shr $__VIRTUAL_MASK_SHIFT, %rcx - jnz opportunistic_sysret_failed - - cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ - jne opportunistic_sysret_failed - - movq R11(%rsp),%r11 - cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ - jne opportunistic_sysret_failed - - /* - * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, - * restoring TF results in a trap from userspace immediately after - * SYSRET. This would cause an infinite loop whenever #DB happens - * with register state that satisfies the opportunistic SYSRET - * conditions. For example, single-stepping this user code: - * - * movq $stuck_here,%rcx - * pushfq - * popq %r11 - * stuck_here: - * - * would never get past 'stuck_here'. - */ - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz opportunistic_sysret_failed - - /* nothing to check for RSP */ - - cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ - jne opportunistic_sysret_failed - - /* - * We win! This label is here just for ease of understanding - * perf profiles. Nothing jumps here. - */ -irq_return_via_sysret: - CFI_REMEMBER_STATE - /* r11 is already restored (see code above) */ - RESTORE_C_REGS_EXCEPT_R11 - movq RSP(%rsp),%rsp - USERGS_SYSRET64 - CFI_RESTORE_STATE - -opportunistic_sysret_failed: SWAPGS - jmp restore_args + jmp restore_c_regs_and_iret /* Returning to kernel space */ retint_kernel: @@ -761,7 +770,12 @@ retint_kernel: * The iretq could re-enable interrupts: */ TRACE_IRQS_IRETQ -restore_args: + +/* + * At this label, code paths which return to kernel and to user, + * which come from interrupts/exception and from syscalls, merge. + */ +restore_c_regs_and_iret: RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 From 3304c9c37bef30ebd2ef71d986e6568372ce80f8 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 3 Apr 2015 21:49:13 +0200 Subject: [PATCH 134/148] x86/asm/entry/irq: Simplify interrupt dispatch table (IDT) layout Interrupt entry points are handled with the following code, each 32-byte code block contains seven entry points: ... 
[push][jump 22] // 4 bytes [push][jump 18] // 4 bytes [push][jump 14] // 4 bytes [push][jump 10] // 4 bytes [push][jump 6] // 4 bytes [push][jump 2] // 4 bytes [push][jump common_interrupt][padding] // 8 bytes [push][jump] [push][jump] [push][jump] [push][jump] [push][jump] [push][jump] [push][jump common_interrupt][padding] [padding_2] common_interrupt: And there is a table which holds pointers to every entry point, IOW: to every push. In cold cache, two jumps are still costlier than one, even though we get the benefit of them residing in the same cacheline. This change replaces short jumps with near ones to 'common_interrupt', and pads every push+jump pair to 8 bytes. This way, each interrupt takes only one jump. This change replaces ".p2align CONFIG_X86_L1_CACHE_SHIFT" before dispatch table with ".align 8" - we do not need anything stronger than that. The table of entry addresses (the interrupt[] array) is no longer necessary, the address of entries can be easily calculated as (irq_entries_start + i*8). text data bss dec hex filename 12546 0 0 12546 3102 entry_64.o.before 11626 0 0 11626 2d6a entry_64.o The size decrease is because 1656 bytes of .init.rodata are gone. That's initdata, though. The resident size does go up a bit. Run-tested (32 and 64 bits). Acked-and-Tested-by: Borislav Petkov Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1428090553-7283-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/hw_irq.h | 5 ++--- arch/x86/kernel/entry_32.S | 41 +++++++++-------------------------- arch/x86/kernel/entry_64.S | 41 +++++++++-------------------------- arch/x86/kernel/irqinit.c | 3 ++- arch/x86/lguest/boot.c | 3 ++- 5 files changed, 26 insertions(+), 67 deletions(-) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 9662290e0b20..e9571ddabc4f 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -181,10 +181,9 @@ extern __visible void smp_call_function_single_interrupt(struct pt_regs *); extern __visible void smp_invalidate_interrupt(struct pt_regs *); #endif -extern void (*__initconst interrupt[FIRST_SYSTEM_VECTOR - - FIRST_EXTERNAL_VECTOR])(void); +extern char irq_entries_start[]; #ifdef CONFIG_TRACING -#define trace_interrupt interrupt +#define trace_irq_entries_start irq_entries_start #endif #define VECTOR_UNDEFINED (-1) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index effa2793feba..02bec0f1d1e1 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -723,43 +723,22 @@ END(sysenter_badsys) .endm /* - * Build the entry stubs and pointer table with some assembler magic. - * We pack 7 stubs into a single 32-byte chunk, which will fit in a - * single cache line on all modern x86 implementations. + * Build the entry stubs with some assembler magic. + * We pack 1 stub into every 8-byte block. 
*/ -.section .init.rodata,"a" -ENTRY(interrupt) -.section .entry.text, "ax" - .p2align 5 - .p2align CONFIG_X86_L1_CACHE_SHIFT + .align 8 ENTRY(irq_entries_start) RING0_INT_FRAME -vector=FIRST_EXTERNAL_VECTOR -.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 - .balign 32 - .rept 7 - .if vector < FIRST_SYSTEM_VECTOR - .if vector <> FIRST_EXTERNAL_VECTOR + vector=FIRST_EXTERNAL_VECTOR + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_interrupt CFI_ADJUST_CFA_OFFSET -4 - .endif -1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ - .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 - jmp 2f - .endif - .previous - .long 1b - .section .entry.text, "ax" -vector=vector+1 - .endif - .endr -2: jmp common_interrupt -.endr + .align 8 + .endr END(irq_entries_start) -.previous -END(interrupt) -.previous - /* * the CPU automatically disables interrupts when executing an IRQ vector, * so IRQ-flags tracing has to follow that: diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e4c810395bae..4ca03c518ab4 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -608,44 +608,23 @@ ENTRY(ret_from_fork) END(ret_from_fork) /* - * Build the entry stubs and pointer table with some assembler magic. - * We pack 7 stubs into a single 32-byte chunk, which will fit in a - * single cache line on all modern x86 implementations. + * Build the entry stubs with some assembler magic. + * We pack 1 stub into every 8-byte block. */ - .section .init.rodata,"a" -ENTRY(interrupt) - .section .entry.text - .p2align 5 - .p2align CONFIG_X86_L1_CACHE_SHIFT + .align 8 ENTRY(irq_entries_start) INTR_FRAME -vector=FIRST_EXTERNAL_VECTOR -.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 - .balign 32 - .rept 7 - .if vector < FIRST_SYSTEM_VECTOR - .if vector <> FIRST_EXTERNAL_VECTOR + vector=FIRST_EXTERNAL_VECTOR + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_interrupt CFI_ADJUST_CFA_OFFSET -8 - .endif -1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ - .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 - jmp 2f - .endif - .previous - .quad 1b - .section .entry.text -vector=vector+1 - .endif - .endr -2: jmp common_interrupt -.endr + .align 8 + .endr CFI_ENDPROC END(irq_entries_start) -.previous -END(interrupt) -.previous - /* * Interrupt entry/exit. * diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 70e181ea1eac..cd10a6437264 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -178,7 +178,8 @@ void __init native_init_IRQ(void) #endif for_each_clear_bit_from(i, used_vectors, first_system_vector) { /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ - set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); + set_intr_gate(i, irq_entries_start + + 8 * (i - FIRST_EXTERNAL_VECTOR)); } #ifdef CONFIG_X86_LOCAL_APIC for_each_clear_bit_from(i, used_vectors, NR_VECTORS) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 8561585ee2c6..717908b16037 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -868,7 +868,8 @@ static void __init lguest_init_IRQ(void) /* Some systems map "vectors" to interrupts weirdly. Not us! 
*/ __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR); if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); + set_intr_gate(i, irq_entries_start + + 8 * (i - FIRST_EXTERNAL_VECTOR)); } /* From 8b3607b5b8c591d8bf045911cb7c90ecaa6e7b73 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 7 Apr 2015 18:42:47 +0200 Subject: [PATCH 135/148] x86/asm/entry/64: Add forgotten CFI annotation Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1428424967-14460-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 4ca03c518ab4..3197f41ce320 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -506,6 +506,7 @@ return_from_execve: 1: /* must use IRET code path (pt_regs->cs may have changed) */ addq $8, %rsp + CFI_ADJUST_CFA_OFFSET -8 ZERO_EXTRA_REGS movq %rax,RAX(%rsp) jmp int_ret_from_sys_call From 31f0119b817f6474a7b4c48fed7588af1b62c543 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 7 Apr 2015 22:43:37 +0200 Subject: [PATCH 136/148] x86/asm/entry/64: Use common code for rt_sigreturn() epilogue Similarly to stub_execve, we can reuse the epilogue in stub_rt_sigreturn() and stub_x32_rt_sigreturn(). Add a comment explaining why we can't eliminate SAVE_EXTRA_REGS here.
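To illustrate the reasoning (a rough sketch, not part of the patch below):

	SAVE_EXTRA_REGS 8          /* copy rbx/rbp/r12-r15 into pt_regs */
	call sys_rt_sigreturn      /* success: overwrites all of pt_regs->GPREGS  */
	                           /* failure: the extra GPREGS slots are left as-is */
	RESTORE_EXTRA_REGS         /* without the save above, this would reload
	                              whatever stale values sat in those slots */
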
+ */ + SAVE_EXTRA_REGS 8 call sys_rt_sigreturn - movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer +return_from_stub: + addq $8, %rsp + CFI_ADJUST_CFA_OFFSET -8 RESTORE_EXTRA_REGS + movq %rax,RAX(%rsp) jmp int_ret_from_sys_call CFI_ENDPROC END(stub_rt_sigreturn) @@ -540,13 +549,10 @@ END(stub_rt_sigreturn) #ifdef CONFIG_X86_X32_ABI ENTRY(stub_x32_rt_sigreturn) CFI_STARTPROC - addq $8, %rsp - DEFAULT_FRAME 0 - SAVE_EXTRA_REGS + DEFAULT_FRAME 0, 8 + SAVE_EXTRA_REGS 8 call sys32_x32_rt_sigreturn - movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer - RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call + jmp return_from_stub CFI_ENDPROC END(stub_x32_rt_sigreturn) From 05f1752d195c145d02ae40881d0985c2cfbee473 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 7 Apr 2015 22:43:38 +0200 Subject: [PATCH 137/148] x86/asm/entry/64: Move stub_x32_execvecloser() to stub_execveat() This is a preparatory patch for moving stub32_execve[at]() to this file. It makes sense to have all execve stubs in one place, so that they can reuse code. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1428439424-7258-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 5252e6021826..f7d9ba6f73a3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -521,6 +521,23 @@ ENTRY(stub_execveat) CFI_ENDPROC END(stub_execveat) +#ifdef CONFIG_X86_X32_ABI +ENTRY(stub_x32_execve) + CFI_STARTPROC + DEFAULT_FRAME 0, 8 + call compat_sys_execve + jmp return_from_execve + CFI_ENDPROC +END(stub_x32_execve) +ENTRY(stub_x32_execveat) + CFI_STARTPROC + DEFAULT_FRAME 0, 8 + call compat_sys_execveat + jmp return_from_execve + CFI_ENDPROC +END(stub_x32_execveat) +#endif + /* * sigreturn is special because it needs to restore all registers on return. * This cannot be done with SYSRET, so use the IRET return path instead. @@ -555,23 +572,6 @@ ENTRY(stub_x32_rt_sigreturn) jmp return_from_stub CFI_ENDPROC END(stub_x32_rt_sigreturn) - -ENTRY(stub_x32_execve) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 - call compat_sys_execve - jmp return_from_execve - CFI_ENDPROC -END(stub_x32_execve) - -ENTRY(stub_x32_execveat) - CFI_STARTPROC - DEFAULT_FRAME 0, 8 - call compat_sys_execveat - jmp return_from_execve - CFI_ENDPROC -END(stub_x32_execveat) - #endif /* From 0f90fb979d7b53d80a6d5cb6e127b4b4b249907e Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 7 Apr 2015 22:43:39 +0200 Subject: [PATCH 138/148] x86/asm/entry: Zero EXTRA_REGS for stub32_execve() too The change which affected how execve clears EXTRA_REGS missed 32-bit execve syscalls. Fix this by using 64-bit execve stub epilogue for them too. Run-tested. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1428439424-7258-3-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/ia32/ia32entry.S | 2 -- arch/x86/kernel/entry_64.S | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 5d8f987a340d..a821b1cd4fa7 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -571,8 +571,6 @@ GLOBAL(\label) PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn PTREGSCALL stub32_sigreturn, sys32_sigreturn - PTREGSCALL stub32_execve, compat_sys_execve - PTREGSCALL stub32_execveat, compat_sys_execveat PTREGSCALL stub32_fork, sys_fork PTREGSCALL stub32_vfork, sys_vfork diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index f7d9ba6f73a3..5380b3a8f3e5 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -538,6 +538,21 @@ ENTRY(stub_x32_execveat) END(stub_x32_execveat) #endif +#ifdef CONFIG_IA32_EMULATION +ENTRY(stub32_execve) + CFI_STARTPROC + call compat_sys_execve + jmp return_from_execve + CFI_ENDPROC +END(stub32_execve) +ENTRY(stub32_execveat) + CFI_STARTPROC + call compat_sys_execveat + jmp return_from_execve + CFI_ENDPROC +END(stub32_execveat) +#endif + /* * sigreturn is special because it needs to restore all registers on return. * This cannot be done with SYSRET, so use the IRET return path instead. From 772951c4e4b06cdffeff499259dba07b544f3166 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 7 Apr 2015 22:43:40 +0200 Subject: [PATCH 139/148] x86/asm/entry/64: Optimize [v]fork/clone stubs Replace "call func; ret" with "jmp func". Run-tested. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1428439424-7258-4-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 5380b3a8f3e5..ce852565443a 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -484,8 +484,7 @@ ENTRY(stub_\func) CFI_STARTPROC DEFAULT_FRAME 0, 8 /* offset 8: return address */ SAVE_EXTRA_REGS 8 - call sys_\func - ret + jmp sys_\func CFI_ENDPROC END(stub_\func) .endm From a30b0085f54efae11f6256df4e4a16af7eefc1c4 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 7 Apr 2015 22:43:41 +0200 Subject: [PATCH 140/148] x86/asm/entry/64: Remove a redundant jump Jumping to the very next instruction is not very useful: jmp label label: Removing the jump. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1428439424-7258-5-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index ce852565443a..e8ddd5196ce7 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1448,7 +1448,6 @@ ENTRY(nmi) /* If it is below the NMI stack, it is a normal NMI */ jb first_nmi /* Ah, it is within the NMI stack, treat it as nested */ - jmp nested_nmi CFI_REMEMBER_STATE From 66ad4efa51805964521db03d8aa827a8dd9058b9 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 7 Apr 2015 22:43:42 +0200 Subject: [PATCH 141/148] x86/asm/entry/64: Simplify jumps in ret_from_fork Replace test jz 1f jmp label 1: with test jnz label Run-tested. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1428439424-7258-6-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e8ddd5196ce7..a35e5e4435ef 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -608,18 +608,18 @@ ENTRY(ret_from_fork) RESTORE_EXTRA_REGS testl $3,CS(%rsp) # from kernel_thread? - jz 1f /* * By the time we get here, we have no idea whether our pt_regs, * ti flags, and ti status came from the 64-bit SYSCALL fast path, * the slow path, or one of the ia32entry paths. - * Use int_ret_from_sys_call to return, since it can safely handle + * Use IRET code path to return, since it can safely handle * all of the above. */ - jmp int_ret_from_sys_call + jnz int_ret_from_sys_call -1: + /* We came from kernel_thread */ + /* nb: we depend on RESTORE_EXTRA_REGS above */ movq %rbp, %rdi call *%rbx movl $0, RAX(%rsp) From 54a81e914b2432a86dd49cf611b0f71ef44ca7ad Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 7 Apr 2015 22:43:43 +0200 Subject: [PATCH 142/148] x86/asm/entry/64: Remove GET_THREAD_INFO() in ret_from_fork It used to be used to check for _TIF_IA32, but the check has been removed. Remove GET_THREAD_INFO() too. Run-tested. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1428439424-7258-7-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a35e5e4435ef..b67f2fc0d160 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -603,8 +603,6 @@ ENTRY(ret_from_fork) call schedule_tail # rdi: 'prev' task parameter - GET_THREAD_INFO(%rcx) - RESTORE_EXTRA_REGS testl $3,CS(%rsp) # from kernel_thread? From a37f34a325d90856314ccd4994e1070dcc6bdcc4 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Tue, 7 Apr 2015 22:43:44 +0200 Subject: [PATCH 143/148] x86/asm/entry/64: Reduce padding in execve stubs execve stubs are 7 bytes only. 
Padding them to 16 bytes is a waste. text data bss dec hex filename 12594 0 0 12594 3132 entry_64.o.before 12530 0 0 12530 30f2 entry_64.o Run-tested. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1428439424-7258-8-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index b67f2fc0d160..c7b238494b31 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -511,8 +511,12 @@ return_from_execve: jmp int_ret_from_sys_call CFI_ENDPROC END(stub_execve) - -ENTRY(stub_execveat) +/* + * Remaining execve stubs are only 7 bytes long. + * ENTRY() often aligns to 16 bytes, which in this case has no benefits. + */ + .align 8 +GLOBAL(stub_execveat) CFI_STARTPROC DEFAULT_FRAME 0, 8 call sys_execveat @@ -521,14 +525,16 @@ ENTRY(stub_execveat) END(stub_execveat) #ifdef CONFIG_X86_X32_ABI -ENTRY(stub_x32_execve) + .align 8 +GLOBAL(stub_x32_execve) CFI_STARTPROC DEFAULT_FRAME 0, 8 call compat_sys_execve jmp return_from_execve CFI_ENDPROC END(stub_x32_execve) -ENTRY(stub_x32_execveat) + .align 8 +GLOBAL(stub_x32_execveat) CFI_STARTPROC DEFAULT_FRAME 0, 8 call compat_sys_execveat @@ -538,13 +544,15 @@ END(stub_x32_execveat) #endif #ifdef CONFIG_IA32_EMULATION -ENTRY(stub32_execve) + .align 8 +GLOBAL(stub32_execve) CFI_STARTPROC call compat_sys_execve jmp return_from_execve CFI_ENDPROC END(stub32_execve) -ENTRY(stub32_execveat) + .align 8 +GLOBAL(stub32_execveat) CFI_STARTPROC call compat_sys_execveat jmp return_from_execve From 54cfa458b492fcf76d653698d362743bcb329055 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 10 Apr 2015 20:13:40 +0200 Subject: [PATCH 144/148] x86/asm/entry/32: Tidy up JNZ instructions after TESTs After TESTs, use logically correct JNZ mnemonic instead of JNE. This doesn't change code: md5: c3005b39a11fe582b7df7908561ad4ee entry_32.o.before.asm c3005b39a11fe582b7df7908561ad4ee entry_32.o.after.asm Signed-off-by: Denys Vlasenko Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Will Drewry Link: http://lkml.kernel.org/r/1428689620-21881-1-git-send-email-dvlasenk@redhat.com [ Added object file comparison. 
] Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 02bec0f1d1e1..1c309763e321 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -435,7 +435,7 @@ sysenter_after_call: TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testl $_TIF_ALLWORK_MASK, %ecx - jne sysexit_audit + jnz sysexit_audit sysenter_exit: /* if something modifies registers it must also disable sysexit */ movl PT_EIP(%esp), %edx @@ -463,7 +463,7 @@ sysenter_audit: sysexit_audit: testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jne syscall_exit_work + jnz syscall_exit_work TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) movl %eax,%edx /* second arg, syscall return value */ @@ -475,7 +475,7 @@ sysexit_audit: TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jne syscall_exit_work + jnz syscall_exit_work movl PT_EAX(%esp),%eax /* reload syscall return value */ jmp sysenter_exit #endif @@ -513,7 +513,7 @@ syscall_exit: TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testl $_TIF_ALLWORK_MASK, %ecx # current->work - jne syscall_exit_work + jnz syscall_exit_work restore_all: TRACE_IRQS_IRET @@ -615,7 +615,7 @@ work_notifysig: # deal with pending signals and #ifdef CONFIG_VM86 testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) movl %esp, %eax - jne work_notifysig_v86 # returning to kernel-space or + jnz work_notifysig_v86 # returning to kernel-space or # vm86-space 1: #else From aa21df0424468dbd991440d41ba0f700c5997103 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 10 Apr 2015 15:06:56 +0200 Subject: [PATCH 145/148] perf/x86/64: Do not guess user_regs->cs, ss, sp in get_regs_user() After recent changes to syscall entry points, user_regs->{cs,ss,sp} are always correct. (They used to be undefined while in syscalls). We can report them reliably, without guessing. Signed-off-by: Denys Vlasenko Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1428671219-29341-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/perf_regs.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index 02a8720414c0..7ab198acc5b8 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -131,8 +131,8 @@ void perf_get_regs_user(struct perf_regs *regs_user, } /* - * RIP, flags, and the argument registers are usually saved. - * orig_ax is probably okay, too. + * These registers are always saved on 64-bit syscall entry. + * On 32-bit entry points, they are saved too except r8..r11. */ regs_user_copy->ip = user_regs->ip; regs_user_copy->cx = user_regs->cx; @@ -145,9 +145,12 @@ void perf_get_regs_user(struct perf_regs *regs_user, regs_user_copy->r11 = user_regs->r11; regs_user_copy->orig_ax = user_regs->orig_ax; regs_user_copy->flags = user_regs->flags; + regs_user_copy->sp = user_regs->sp; + regs_user_copy->cs = user_regs->cs; + regs_user_copy->ss = user_regs->ss; /* - * Don't even try to report the "rest" regs. + * Most system calls don't save these registers, don't report them. */ regs_user_copy->bx = -1; regs_user_copy->bp = -1; @@ -158,7 +161,7 @@ void perf_get_regs_user(struct perf_regs *regs_user, /* * For this to be at all useful, we need a reasonable guess for - * sp and the ABI. Be careful: we're in NMI context, and we're + * the ABI. 
Be careful: we're in NMI context, and we're * considering current to be the current task, so we should * be careful not to look at any other percpu variables that might * change during context switches. @@ -167,9 +170,6 @@ void perf_get_regs_user(struct perf_regs *regs_user, task_thread_info(current)->status & TS_COMPAT) { /* Easy case: we're in a compat syscall. */ regs_user->abi = PERF_SAMPLE_REGS_ABI_32; - regs_user_copy->sp = user_regs->sp; - regs_user_copy->cs = user_regs->cs; - regs_user_copy->ss = user_regs->ss; } else if (user_regs->orig_ax != -1) { /* * We're probably in a 64-bit syscall. @@ -177,17 +177,12 @@ void perf_get_regs_user(struct perf_regs *regs_user, * than just blindly copying user_regs. */ regs_user->abi = PERF_SAMPLE_REGS_ABI_64; - regs_user_copy->sp = user_regs->sp; - regs_user_copy->cs = __USER_CS; - regs_user_copy->ss = __USER_DS; - regs_user_copy->cx = -1; /* usually contains garbage */ + /* usually contains return address (same as ->ip) */ + regs_user_copy->cx = -1; } else { /* We're probably in an interrupt or exception. */ regs_user->abi = user_64bit_mode(user_regs) ? PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; - regs_user_copy->sp = user_regs->sp; - regs_user_copy->cs = user_regs->cs; - regs_user_copy->ss = user_regs->ss; } regs_user->regs = regs_user_copy; From 5df71b396b2d1fdd9d9f5a33e2eda5dc27c5632d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 10 Apr 2015 15:06:57 +0200 Subject: [PATCH 146/148] perf/x86/64: Do report user_regs->cx while we are in syscall, in get_regs_user() Yes, it is true that cx contains return address. It's not clear why we trash it. Stop doing that. Signed-off-by: Denys Vlasenko Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1428671219-29341-2-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/perf_regs.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index 7ab198acc5b8..a8d4e4862ace 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -177,8 +177,6 @@ void perf_get_regs_user(struct perf_regs *regs_user, * than just blindly copying user_regs. */ regs_user->abi = PERF_SAMPLE_REGS_ABI_64; - /* usually contains return address (same as ->ip) */ - regs_user_copy->cx = -1; } else { /* We're probably in an interrupt or exception. */ regs_user->abi = user_64bit_mode(user_regs) ? From 32caa06091cc59651222cdc971dc21eaab36b097 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 10 Apr 2015 15:06:58 +0200 Subject: [PATCH 147/148] perf/x86/64: Simplify regs_user->abi setting code in get_regs_user() user_64bit_mode(regs) basically checks whether regs->cs points to a 64-bit segment. This check used to be unreliable here because regs->cs was not always correct in syscalls. Now regs->cs is always correct: in syscalls, in interrupts, in exceptions. No need to employ heuristics here.
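To make the simplification concrete, here is a minimal user-space sketch of the idea, not the kernel code itself: once regs->cs is trustworthy, the old three-way compat/syscall/interrupt heuristic collapses into a single code-segment check. The ABI_32/ABI_64 constants, the USER_CS_64BIT value and the sample_abi() helper below are illustrative stand-ins for PERF_SAMPLE_REGS_ABI_*, __USER_CS and user_64bit_mode().

  #include <stdio.h>

  #define ABI_32 1
  #define ABI_64 2
  #define USER_CS_64BIT 0x33      /* assumed 64-bit user code selector */

  struct fake_regs { unsigned long cs; };

  /* One selector check replaces the compat/orig_ax/interrupt heuristic. */
  static int sample_abi(const struct fake_regs *regs)
  {
          return (regs->cs == USER_CS_64BIT) ? ABI_64 : ABI_32;
  }

  int main(void)
  {
          struct fake_regs r64 = { .cs = 0x33 };  /* 64-bit user code */
          struct fake_regs r32 = { .cs = 0x23 };  /* assumed 32-bit user code selector */

          printf("cs=0x33 -> abi %d, cs=0x23 -> abi %d\n",
                 sample_abi(&r64), sample_abi(&r32));
          return 0;
  }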
Signed-off-by: Denys Vlasenko Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1428671219-29341-3-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/perf_regs.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index a8d4e4862ace..8157d3900bc2 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -166,22 +166,8 @@ void perf_get_regs_user(struct perf_regs *regs_user, * be careful not to look at any other percpu variables that might * change during context switches. */ - if (IS_ENABLED(CONFIG_IA32_EMULATION) && - task_thread_info(current)->status & TS_COMPAT) { - /* Easy case: we're in a compat syscall. */ - regs_user->abi = PERF_SAMPLE_REGS_ABI_32; - } else if (user_regs->orig_ax != -1) { - /* - * We're probably in a 64-bit syscall. - * Warning: this code is severely racy. At least it's better - * than just blindly copying user_regs. - */ - regs_user->abi = PERF_SAMPLE_REGS_ABI_64; - } else { - /* We're probably in an interrupt or exception. */ - regs_user->abi = user_64bit_mode(user_regs) ? - PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; - } + regs_user->abi = user_64bit_mode(user_regs) ? + PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; regs_user->regs = regs_user_copy; } From 3b75232d55680ca166dffa274d0587d5faf0a016 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 10 Apr 2015 15:06:59 +0200 Subject: [PATCH 148/148] perf/x86/64: Report regs_user->ax too in get_regs_user() I don't see why we report e.g. orig_ax, which is not always meaningful, but don't report ax, which is meaningful. Signed-off-by: Denys Vlasenko Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Paul Mackerras Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1428671219-29341-4-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/perf_regs.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index 8157d3900bc2..da8cb987b973 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -135,6 +135,7 @@ void perf_get_regs_user(struct perf_regs *regs_user, * On 32-bit entry points, they are saved too except r8..r11. */ regs_user_copy->ip = user_regs->ip; + regs_user_copy->ax = user_regs->ax; regs_user_copy->cx = user_regs->cx; regs_user_copy->dx = user_regs->dx; regs_user_copy->si = user_regs->si;
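As a closing illustration of where this series leaves perf's user-register sampling, here is a small user-space sketch, not the kernel function: the registers that the entry code reliably saves (ip, ax, cx, dx, si, di, r8-r11, orig_ax, flags, sp, cs, ss) are taken from the saved frame, while bx, bp and r12-r15 are poisoned with -1 because the 64-bit fast path does not save them. The fake_pt_regs type and sample_user_regs() helper are made up for the example; the real code is perf_get_regs_user() in arch/x86/kernel/perf_regs.c.

  #include <stdint.h>
  #include <stdio.h>

  struct fake_pt_regs {
          uint64_t ip, ax, cx, dx, si, di, r8, r9, r10, r11;
          uint64_t orig_ax, flags, sp, cs, ss;
          uint64_t bx, bp, r12, r13, r14, r15;  /* not saved by the fast syscall path */
  };

  /* Copy the frame, then overwrite the fields the fast path does not save,
   * so consumers cannot mistake stale values for real ones. */
  static void sample_user_regs(const struct fake_pt_regs *in, struct fake_pt_regs *out)
  {
          *out = *in;
          out->bx = out->bp = (uint64_t)-1;
          out->r12 = out->r13 = out->r14 = out->r15 = (uint64_t)-1;
  }

  int main(void)
  {
          struct fake_pt_regs in = { .ip = 0x400000, .ax = 42, .bx = 7 };
          struct fake_pt_regs out;

          sample_user_regs(&in, &out);
          printf("ax=%llu bx=%llu\n",
                 (unsigned long long)out.ax, (unsigned long long)out.bx);
          return 0;
  }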