diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index dc27705..34ed3a2 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -6,6 +6,7 @@
 #include <asm/ldt.h>
 #include <asm/mmu.h>
 #include <linux/smp.h>
+#include <linux/mm_types.h>
 
 static inline void fill_ldt(struct desc_struct *desc,
 			    const struct user_desc *info)
@@ -95,6 +96,9 @@ static inline int desc_empty(const void *ptr)
 
 #define load_TLS(t, cpu) native_load_tls(t, cpu)
 #define set_ldt native_set_ldt
+#ifdef CONFIG_X86_32
+#define load_user_cs_desc native_load_user_cs_desc
+#endif /*CONFIG_X86_32*/
 
 #define write_ldt_entry(dt, entry, desc) \
 	native_write_ldt_entry(dt, entry, desc)
@@ -379,6 +383,27 @@ static inline void set_system_intr_gate_ist(int n, void *addr, unsigned ist)
 	_set_gate(n, GATE_INTERRUPT, addr, 0x3, ist, __KERNEL_CS);
 }
 
+#ifdef CONFIG_X86_32
+static inline void set_user_cs(struct desc_struct *desc, unsigned long limit)
+{
+	limit = (limit - 1) / PAGE_SIZE;
+	desc->a = limit & 0xffff;
+	desc->b = (limit & 0xf0000) | 0x00c0fb00;
+}
+
+static inline void native_load_user_cs_desc(int cpu, struct mm_struct *mm)
+{
+	get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS] = (mm)->context.user_cs;
+}
+
+#define arch_add_exec_range arch_add_exec_range
+#define arch_remove_exec_range arch_remove_exec_range
+#define arch_flush_exec_range arch_flush_exec_range
+extern void arch_add_exec_range(struct mm_struct *mm, unsigned long limit);
+extern void arch_remove_exec_range(struct mm_struct *mm, unsigned long limit);
+extern void arch_flush_exec_range(struct mm_struct *mm);
+#endif /* CONFIG_X86_32 */
+
 #else
 /*
  * GET_DESC_BASE reads the descriptor base of the specified segment.
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 80a1dee..8314c66 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -7,12 +7,19 @@
 /*
  * The x86 doesn't have a mmu context, but
  * we put the segment information here.
+ *
+ * exec_limit is used to track the range PROT_EXEC
+ * mappings span.
 */
 typedef struct {
 	void *ldt;
 	int size;
 	struct mutex lock;
 	void *vdso;
+#ifdef CONFIG_X86_32
+	struct desc_struct user_cs;
+	unsigned long exec_limit;
+#endif
 } mm_context_t;
 
 #ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index e299287..aaa8a35 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -113,6 +113,9 @@ struct pv_cpu_ops {
 	void (*store_gdt)(struct desc_ptr *);
 	void (*store_idt)(struct desc_ptr *);
 	void (*set_ldt)(const void *desc, unsigned entries);
+#ifdef CONFIG_X86_32
+	void (*load_user_cs_desc)(int cpu, struct mm_struct *mm);
+#endif /*CONFIG_X86_32*/
 	unsigned long (*store_tr)(void);
 	void (*load_tls)(struct thread_struct *t, unsigned int cpu);
 #ifdef CONFIG_X86_64
@@ -860,6 +863,12 @@ static inline void set_ldt(const void *addr, unsigned entries)
 {
 	PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries);
 }
+#ifdef CONFIG_X86_32
+static inline void load_user_cs_desc(unsigned int cpu, struct mm_struct *mm)
+{
+	PVOP_VCALL2(pv_cpu_ops.load_user_cs_desc, cpu, mm);
+}
+#endif /*CONFIG_X86_32*/
 static inline void store_gdt(struct desc_ptr *dtr)
 {
 	PVOP_VCALL1(pv_cpu_ops.store_gdt, dtr);
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 3bfd523..99c8119 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -158,6 +158,9 @@ static inline int hlt_works(int cpu)
 
 #define cache_line_size()	(boot_cpu_data.x86_cache_alignment)
 
+#define __HAVE_ARCH_ALIGN_STACK
+extern unsigned long arch_align_stack(unsigned long sp);
+
 extern void cpu_detect(struct cpuinfo_x86 *c);
 
 extern struct pt_regs *idle_regs(struct pt_regs *);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 83492b1..a84c787 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -708,6 +708,21 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 	 * we do "generic changes."
 	 */
 
+	/*
+	 * emulation of NX with segment limits unfortunately means
+	 * we have to disable the fast system calls, due to the way that
+	 * sysexit clears the segment limits on return.
+	 * If we have either disabled exec-shield on the boot command line,
+	 * or we have NX, then we don't need to do this.
+	 */
+	if (exec_shield != 0) {
+#ifdef CONFIG_X86_PAE
+		if (!test_cpu_cap(c, X86_FEATURE_NX))
+#endif
+			clear_cpu_cap(c, X86_FEATURE_SEP);
+	}
+
 	/* If the model name is still unset, do table lookup. */
 	if (!c->x86_model_id[0]) {
 		char *p;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index c6520a4..2066aa1 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -352,6 +352,9 @@ struct pv_cpu_ops pv_cpu_ops = {
 	.read_tscp = native_read_tscp,
 	.load_tr_desc = native_load_tr_desc,
 	.set_ldt = native_set_ldt,
+#ifdef CONFIG_X86_32
+	.load_user_cs_desc = native_load_user_cs_desc,
+#endif /*CONFIG_X86_32*/
 	.load_gdt = native_load_gdt,
 	.load_idt = native_load_idt,
 	.store_gdt = native_store_gdt,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index bd4da2a..60823d4 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -343,6 +343,8 @@ int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
 void
 start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 {
+	int cpu;
+
 	__asm__("movl %0, %%gs" : : "r"(0));
 	regs->fs = 0;
 	set_fs(USER_DS);
@@ -352,6 +354,11 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 	regs->cs = __USER_CS;
 	regs->ip = new_ip;
 	regs->sp = new_sp;
+
+	cpu = get_cpu();
+	load_user_cs_desc(cpu, current->mm);
+	put_cpu();
+
 	/*
 	 * Free the old FP and other extended state
 	 */
@@ -519,7 +526,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
 
 	__unlazy_fpu(prev_p);
-
+	if (next_p->mm)
+		load_user_cs_desc(cpu, next_p->mm);
 
 	/* we're going to use this soon, after a few expensive things */
 	if (next_p->fpu_counter > 5)
@@ -692,3 +700,41 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
 	unsigned long range_end = mm->brk + 0x02000000;
 	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
 }
+
+static void modify_cs(struct mm_struct *mm, unsigned long limit)
+{
+	mm->context.exec_limit = limit;
+	set_user_cs(&mm->context.user_cs, limit);
+	if (mm == current->mm) {
+		int cpu;
+
+		cpu = get_cpu();
+		load_user_cs_desc(cpu, mm);
+		put_cpu();
+	}
+}
+
+void arch_add_exec_range(struct mm_struct *mm, unsigned long limit)
+{
+	if (limit > mm->context.exec_limit)
+		modify_cs(mm, limit);
+}
+
+void arch_remove_exec_range(struct mm_struct *mm, unsigned long old_end)
+{
+	struct vm_area_struct *vma;
+	unsigned long limit = PAGE_SIZE;
+
+	if (old_end == mm->context.exec_limit) {
+		for (vma = mm->mmap; vma; vma = vma->vm_next)
+			if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
+				limit = vma->vm_end;
+		modify_cs(mm, limit);
+	}
+}
+
+void arch_flush_exec_range(struct mm_struct *mm)
+{
+	mm->context.exec_limit = 0;
+	set_user_cs(&mm->context.user_cs, 0);
+}
diff --git a/arch/x86/kernel/tlb_32.c b/arch/x86/kernel/tlb_32.c
index ce50546..bd6593a 100644
--- a/arch/x86/kernel/tlb_32.c
+++ b/arch/x86/kernel/tlb_32.c
@@ -2,6 +2,7 @@
 #include <linux/cpu.h>
 #include <linux/interrupt.h>
 
+#include <asm/desc.h>
 #include <asm/tlbflush.h>
 
 DEFINE_PER_CPU(struct tlb_state, cpu_tlbstate)
@@ -91,6 +92,8 @@ void smp_invalidate_interrupt(struct pt_regs *regs)
 	unsigned long cpu;
 
 	cpu = get_cpu();
+	if (current->active_mm)
+		load_user_cs_desc(cpu, current->active_mm);
 
 	if (!cpu_isset(cpu, flush_cpumask))
 		goto out;
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index a9e7548..af0f8f0 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -160,6 +160,76 @@ static int lazy_iobitmap_copy(void)
 
 	return 0;
 }
+
+static inline int
+__compare_user_cs_desc(const struct desc_struct *desc1,
+		       const struct desc_struct *desc2)
+{
+	return ((desc1->limit0 != desc2->limit0) ||
+		(desc1->limit != desc2->limit) ||
+		(desc1->base0 != desc2->base0) ||
+		(desc1->base1 != desc2->base1) ||
+		(desc1->base2 != desc2->base2));
+}
+
+/*
+ * lazy-check for CS validity on exec-shield binaries:
+ *
+ * the original non-exec stack patch was written by
+ * Solar Designer. Thanks!
+ */
+static int
+check_lazy_exec_limit(int cpu, struct pt_regs *regs, long error_code)
+{
+	struct desc_struct *desc1, *desc2;
+	struct vm_area_struct *vma;
+	unsigned long limit;
+
+	if (current->mm == NULL)
+		return 0;
+
+	limit = -1UL;
+	if (current->mm->context.exec_limit != -1UL) {
+		limit = PAGE_SIZE;
+		spin_lock(&current->mm->page_table_lock);
+		for (vma = current->mm->mmap; vma; vma = vma->vm_next)
+			if ((vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
+				limit = vma->vm_end;
+		vma = get_gate_vma(current);
+		if (vma && (vma->vm_flags & VM_EXEC) && (vma->vm_end > limit))
+			limit = vma->vm_end;
+		spin_unlock(&current->mm->page_table_lock);
+		if (limit >= TASK_SIZE)
+			limit = -1UL;
+		current->mm->context.exec_limit = limit;
+	}
+	set_user_cs(&current->mm->context.user_cs, limit);
+
+	desc1 = &current->mm->context.user_cs;
+	desc2 = get_cpu_gdt_table(cpu) + GDT_ENTRY_DEFAULT_USER_CS;
+
+	if (__compare_user_cs_desc(desc1, desc2)) {
+		/*
+		 * The CS was not in sync - reload it and retry the
+		 * instruction. If the instruction still faults then
+		 * we won't hit this branch next time around.
+		 */
+		if (print_fatal_signals >= 2) {
+			printk(KERN_ERR "#GPF fixup (%ld[seg:%lx]) at %08lx, CPU#%d.\n",
+				error_code, error_code/8, regs->ip,
+				smp_processor_id());
+			printk(KERN_ERR "exec_limit: %08lx, user_cs: %08x/%08x, CPU_cs: %08x/%08x.\n",
+				current->mm->context.exec_limit,
+				desc1->a, desc1->b, desc2->a, desc2->b);
+		}
+
+		load_user_cs_desc(cpu, current->mm);
+
+		return 1;
+	}
+
+	return 0;
+}
 #endif
 
 static void __kprobes
@@ -323,6 +393,29 @@ do_general_protection(struct pt_regs *regs, long error_code)
 	if (!user_mode(regs))
 		goto gp_in_kernel;
 
+#ifdef CONFIG_X86_32
+{
+	int cpu;
+	int ok;
+
+	cpu = get_cpu();
+	ok = check_lazy_exec_limit(cpu, regs, error_code);
+	put_cpu();
+
+	if (ok)
+		return;
+
+	if (print_fatal_signals) {
+		printk(KERN_ERR "#GPF(%ld[seg:%lx]) at %08lx, CPU#%d.\n",
+			error_code, error_code/8, regs->ip, smp_processor_id());
+		printk(KERN_ERR "exec_limit: %08lx, user_cs: %08x/%08x.\n",
+			current->mm->context.exec_limit,
+			current->mm->context.user_cs.a,
+			current->mm->context.user_cs.b);
+	}
+}
+#endif /*CONFIG_X86_32*/
+
 	tsk->thread.error_code = error_code;
 	tsk->thread.trap_no = 13;
 
@@ -934,19 +1027,37 @@ dotraplinkage void __kprobes do_device_not_available(struct pt_regs regs)
 }
 
 #ifdef CONFIG_X86_32
+/*
+ * The fixup code for errors in iret jumps to here (iret_exc). It loses
+ * the original trap number and error code. The bogus trap 32 and error
+ * code 0 are what the vanilla kernel delivers via:
+ * DO_ERROR_INFO(32, SIGSEGV, "iret exception", iret_error, ILL_BADSTK, 0, 1)
+ *
+ * NOTE: Because of the final "1" in the macro we need to enable interrupts.
+ *
+ * In case of a general protection fault in the iret instruction, we
+ * need to check for a lazy CS update for exec-shield.
+ */
 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 {
-	siginfo_t info;
+	int ok;
+	int cpu;
+
 	local_irq_enable();
 
-	info.si_signo = SIGILL;
-	info.si_errno = 0;
-	info.si_code = ILL_BADSTK;
-	info.si_addr = 0;
-	if (notify_die(DIE_TRAP, "iret exception",
-			regs, error_code, 32, SIGILL) == NOTIFY_STOP)
-		return;
-	do_trap(32, SIGILL, "iret exception", regs, error_code, &info);
+	cpu = get_cpu();
+	ok = check_lazy_exec_limit(cpu, regs, error_code);
+	put_cpu();
+
+	if (!ok && notify_die(DIE_TRAP, "iret exception", regs,
+			error_code, 32, SIGSEGV) != NOTIFY_STOP) {
+		siginfo_t info;
+		info.si_signo = SIGSEGV;
+		info.si_errno = 0;
+		info.si_code = ILL_BADSTK;
+		info.si_addr = 0;
+		do_trap(32, SIGSEGV, "iret exception", regs, error_code, &info);
+	}
 }
 #endif
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 2cef050..a18ae07 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -617,7 +617,7 @@ static int disable_nx __initdata;
  * Control non executable mappings.
  *
  * on      Enable
- * off     Disable
+ * off     Disable (disables exec-shield too)
  */
 static int __init noexec_setup(char *str)
 {
@@ -626,14 +626,12 @@ static int __init noexec_setup(char *str)
 			__supported_pte_mask |= _PAGE_NX;
 			disable_nx = 0;
 		}
-	} else {
-		if (!strcmp(str, "off")) {
-			disable_nx = 1;
-			__supported_pte_mask &= ~_PAGE_NX;
-		} else {
-			return -EINVAL;
-		}
-	}
+	} else if (!strcmp(str, "off")) {
+		disable_nx = 1;
+		__supported_pte_mask &= ~_PAGE_NX;
+		exec_shield = 0;
+	} else
+		return -EINVAL;
 
 	return 0;
 }
@@ -892,7 +890,11 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
 	set_nx();
 	if (nx_enabled)
 		printk(KERN_INFO "NX (Execute Disable) protection: active\n");
+	else
 #endif
+	if (exec_shield)
+		printk(KERN_INFO "Using x86 segment limits to approximate "
+			"NX protection\n");
 
 	/* Enable PSE if available */
 	if (cpu_has_pse)
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 56fe712..30d2be7 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -111,13 +111,16 @@ static unsigned long mmap_legacy_base(void)
  */
 void arch_pick_mmap_layout(struct mm_struct *mm)
 {
-	if (mmap_is_legacy()) {
+	if (!(2 & exec_shield) && mmap_is_legacy()) {
 		mm->mmap_base = mmap_legacy_base();
 		mm->get_unmapped_area = arch_get_unmapped_area;
 		mm->unmap_area = arch_unmap_area;
 	} else {
 		mm->mmap_base = mmap_base();
 		mm->get_unmapped_area = arch_get_unmapped_area_topdown;
+		if (!(current->personality & READ_IMPLIES_EXEC)
+				&& mmap_is_ia32())
+			mm->get_unmapped_exec_area = arch_get_unmapped_exec_area;
 		mm->unmap_area = arch_unmap_area_topdown;
 	}
 }
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 1241f11..3f2c44c 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -331,7 +331,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 	if (compat)
 		addr = VDSO_HIGH_BASE;
 	else {
-		addr = get_unmapped_area(NULL, 0, PAGE_SIZE, 0, 0);
+		addr = get_unmapped_area_prot(NULL, 0, PAGE_SIZE, 0, 0, 1);
 		if (IS_ERR_VALUE(addr)) {
 			ret = addr;
 			goto up_fail;
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index b58e963..cdc83ce 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -316,6 +316,24 @@ static void xen_set_ldt(const void *addr, unsigned entries)
 	xen_mc_issue(PARAVIRT_LAZY_CPU);
 }
 
+#ifdef CONFIG_X86_32
+static void xen_load_user_cs_desc(int cpu, struct mm_struct *mm)
+{
+	void *gdt;
+	xmaddr_t mgdt;
+	u64 descriptor;
+	struct desc_struct user_cs;
+
+	gdt = &get_cpu_gdt_table(cpu)[GDT_ENTRY_DEFAULT_USER_CS];
+	mgdt = virt_to_machine(gdt);
+
+	user_cs = mm->context.user_cs;
+	descriptor = (u64) user_cs.a | ((u64) user_cs.b) << 32;
+
+	HYPERVISOR_update_descriptor(mgdt.maddr, descriptor);
+}
+#endif /*CONFIG_X86_32*/
+
 static void xen_load_gdt(const struct desc_ptr *dtr)
 {
 	unsigned long *frames;
@@ -1232,6 +1250,9 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 
 	.load_tr_desc = paravirt_nop,
 	.set_ldt = xen_set_ldt,
+#ifdef CONFIG_X86_32
+	.load_user_cs_desc = xen_load_user_cs_desc,
+#endif /*CONFIG_X86_32*/
 	.load_gdt = xen_load_gdt,
 	.load_idt = xen_load_idt,
 	.load_tls = xen_load_tls,
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 33b7235..ce1f044 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -80,7 +80,7 @@ static struct linux_binfmt elf_format = {
 		.hasvdso	= 1
 };
 
-#define BAD_ADDR(x) ((unsigned long)(x) >= TASK_SIZE)
+#define BAD_ADDR(x) IS_ERR_VALUE(x)
 
 static int set_brk(unsigned long start, unsigned long end)
 {
@@ -735,6 +735,11 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 			break;
 		}
 
+	if (current->personality == PER_LINUX && (exec_shield & 2)) {
+		executable_stack = EXSTACK_DISABLE_X;
+		current->flags |= PF_RANDOMIZE;
+	}
+
 	/* Some simple consistency checks for the interpreter */
 	if (elf_interpreter) {
 		retval = -ELIBBAD;
@@ -754,6 +759,15 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	if (retval)
 		goto out_free_dentry;
 
+#ifdef CONFIG_X86_32
+	/*
+	 * Turn off the CS limit completely if exec-shield disabled or
+	 * NX active:
+	 */
+	if (!exec_shield || executable_stack != EXSTACK_DISABLE_X || nx_enabled)
+		arch_add_exec_range(current->mm, -1);
+#endif
+
 	/* OK, This is the point of no return */
 	current->flags &= ~PF_FORKNOEXEC;
 	current->mm->def_flags = def_flags;
@@ -761,7 +775,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	/* Do this immediately, since STACK_TOP as used in setup_arg_pages
 	   may depend on the personality.  */
 	SET_PERSONALITY(loc->elf_ex);
-	if (elf_read_implies_exec(loc->elf_ex, executable_stack))
+	if (!(exec_shield & 2) &&
+			elf_read_implies_exec(loc->elf_ex, executable_stack))
 		current->personality |= READ_IMPLIES_EXEC;
 
 	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
@@ -926,7 +941,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 					    interpreter,
 					    &interp_map_addr,
 					    load_bias);
-		if (!IS_ERR((void *)elf_entry)) {
+		if (!BAD_ADDR(elf_entry)) {
 			/*
 			 * load_elf_interp() returns relocation
 			 * adjustment
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 065cdf8..aa94aa9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1135,7 +1135,13 @@ extern int install_special_mapping(struct mm_struct *mm,
 				   unsigned long addr, unsigned long len,
 				   unsigned long flags, struct page **pages);
 
-extern unsigned long get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
+extern unsigned long get_unmapped_area_prot(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, int);
+
+static inline unsigned long get_unmapped_area(struct file *file, unsigned long addr,
+		unsigned long len, unsigned long pgoff, unsigned long flags)
+{
+	return get_unmapped_area_prot(file, addr, len, pgoff, flags, 0);
+}
 
 extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	unsigned long len, unsigned long prot,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 92915e8..4bfd050 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -194,6 +194,9 @@ struct mm_struct {
 	unsigned long (*get_unmapped_area) (struct file *filp,
 				unsigned long addr, unsigned long len,
 				unsigned long pgoff, unsigned long flags);
+	unsigned long (*get_unmapped_exec_area) (struct file *filp,
+				unsigned long addr, unsigned long len,
+				unsigned long pgoff, unsigned long flags);
 	void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
 	unsigned long mmap_base;	/* base of mmap area */
 	unsigned long task_size;	/* size of task vm space */
diff --git a/include/linux/resource.h b/include/linux/resource.h
index 40fc7e6..68c2549 100644
--- a/include/linux/resource.h
+++ b/include/linux/resource.h
@@ -55,8 +55,11 @@ struct rlimit {
 /*
  * Limit the stack by to some sane default: root can always
  * increase this limit if needed..  8MB seems reasonable.
+ *
+ * (2MB more to cover randomization effects.)
  */
-#define _STK_LIM	(8*1024*1024)
+#define _STK_LIM	(10*1024*1024)
+#define EXEC_STACK_BIAS	(2*1024*1024)
 
 /*
  * GPG2 wants 64kB of mlocked memory, to make sure pass phrases
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8c216e0..79eca33 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -98,6 +98,9 @@ struct robust_list_head;
 struct bio;
 struct bts_tracer;
 
+extern int exec_shield;
+extern int print_fatal_signals;
+
 /*
  * List of flags we want to share for kernel threads,
  * if only because they are not used by them anyway.
@@ -346,6 +349,10 @@ extern int sysctl_max_map_count;
 
 extern unsigned long
 arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
 		       unsigned long, unsigned long);
+
+extern unsigned long
+arch_get_unmapped_exec_area(struct file *, unsigned long, unsigned long,
+		       unsigned long, unsigned long);
 extern unsigned long
 arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
 			  unsigned long len, unsigned long pgoff,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c5ef44f..f7abce4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -85,6 +85,26 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
 #ifndef CONFIG_MMU
 extern int sysctl_nr_trim_pages;
 #endif
+
+int exec_shield = (1<<0);
+/* exec_shield is a bitmask:
+ * 0: off; vdso at STACK_TOP, 1 page below TASK_SIZE
+ * (1<<0) 1: on [also on if !=0]
+ * (1<<1) 2: force noexecstack regardless of PT_GNU_STACK
+ * The old settings
+ * (1<<2) 4: vdso just below .text of main (unless too low)
+ * (1<<3) 8: vdso just below .text of PT_INTERP (unless too low)
+ * are ignored because the vdso is placed completely randomly
+ */
+
+static int __init setup_exec_shield(char *str)
+{
+	get_option(&str, &exec_shield);
+
+	return 1;
+}
+__setup("exec-shield=", setup_exec_shield);
+
 #ifdef CONFIG_RCU_TORTURE_TEST
 extern int rcutorture_runnable;
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
@@ -379,6 +399,14 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "exec-shield",
+		.data		= &exec_shield,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
 		.ctl_name	= KERN_CORE_USES_PID,
 		.procname	= "core_uses_pid",
 		.data		= &core_uses_pid,
diff --git a/mm/mmap.c b/mm/mmap.c
index 00ced3e..931bc3b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -27,6 +27,7 @@
 #include <linux/mempolicy.h>
 #include <linux/rmap.h>
 #include <linux/mmu_notifier.h>
+#include <linux/random.h>
 
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -43,6 +44,18 @@
 #define arch_rebalance_pgtables(addr, len)		(addr)
 #endif
 
+/* No sane architecture will #define these to anything else */
+#ifndef arch_add_exec_range
+#define arch_add_exec_range(mm, limit)	do { ; } while (0)
+#endif
+#ifndef arch_flush_exec_range
+#define arch_flush_exec_range(mm)	do { ; } while (0)
+#endif
+#ifndef arch_remove_exec_range
+#define arch_remove_exec_range(mm, limit)	do { ; } while (0)
+#endif
+
+
 static void unmap_region(struct mm_struct *mm,
 		struct vm_area_struct *vma, struct vm_area_struct *prev,
 		unsigned long start, unsigned long end);
@@ -391,6 +404,8 @@ static inline void
 __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
 		struct vm_area_struct *prev, struct rb_node *rb_parent)
 {
+	if (vma->vm_flags & VM_EXEC)
+		arch_add_exec_range(mm, vma->vm_end);
 	if (prev) {
 		vma->vm_next = prev->vm_next;
 		prev->vm_next = vma;
@@ -493,6 +508,8 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
 	rb_erase(&vma->vm_rb, &mm->mm_rb);
 	if (mm->mmap_cache == vma)
 		mm->mmap_cache = prev;
+	if (vma->vm_flags & VM_EXEC)
+		arch_remove_exec_range(mm, vma->vm_end);
 }
 
 /*
@@ -802,6 +819,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
 		} else					/* cases 2, 5, 7 */
 			vma_adjust(prev, prev->vm_start,
 				end, prev->vm_pgoff, NULL);
+		if (prev->vm_flags & VM_EXEC)
+			arch_add_exec_range(mm, prev->vm_end);
 		return prev;
 	}
 
@@ -956,7 +975,8 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 	/* Obtain the address to map to. we verify (or select) it and ensure
 	 * that it represents a valid section of the address space.
 	 */
-	addr = get_unmapped_area(file, addr, len, pgoff, flags);
+	addr = get_unmapped_area_prot(file, addr, len, pgoff, flags,
+					prot & PROT_EXEC);
 	if (addr & ~PAGE_MASK)
 		return addr;
 
@@ -1436,13 +1456,17 @@ void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr)
 }
 
 unsigned long
-get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
-		unsigned long pgoff, unsigned long flags)
+get_unmapped_area_prot(struct file *file, unsigned long addr, unsigned long len,
+		unsigned long pgoff, unsigned long flags, int exec)
 {
 	unsigned long (*get_area)(struct file *, unsigned long,
 				  unsigned long, unsigned long, unsigned long);
 
-	get_area = current->mm->get_unmapped_area;
+	if (exec && current->mm->get_unmapped_exec_area)
+		get_area = current->mm->get_unmapped_exec_area;
+	else
+		get_area = current->mm->get_unmapped_area;
+
 	if (file && file->f_op && file->f_op->get_unmapped_area)
 		get_area = file->f_op->get_unmapped_area;
 	addr = get_area(file, addr, len, pgoff, flags);
@@ -1456,8 +1480,76 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
 
 	return arch_rebalance_pgtables(addr, len);
 }
+EXPORT_SYMBOL(get_unmapped_area_prot);
+
+#define SHLIB_BASE	0x00110000
+
+unsigned long
+arch_get_unmapped_exec_area(struct file *filp, unsigned long addr0,
+		unsigned long len0, unsigned long pgoff, unsigned long flags)
+{
+	unsigned long addr = addr0, len = len0;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long tmp;
+
+	if (len > TASK_SIZE)
+		return -ENOMEM;
+
+	if (flags & MAP_FIXED)
+		return addr;
+
+	if (!addr)
+		addr = randomize_range(SHLIB_BASE, 0x01000000, len);
+
+	if (addr) {
+		addr = PAGE_ALIGN(addr);
+		vma = find_vma(mm, addr);
+		if (TASK_SIZE - len >= addr &&
+		    (!vma || addr + len <= vma->vm_start))
+			return addr;
+	}
+
+	addr = SHLIB_BASE;
+	for (vma = find_vma(mm, addr); ; vma = vma->vm_next) {
+		/* At this point: (!vma || addr < vma->vm_end). */
+		if (TASK_SIZE - len < addr)
+			return -ENOMEM;
+
+		if (!vma || addr + len <= vma->vm_start) {
+			/*
+			 * Must not let a PROT_EXEC mapping get into the
+			 * brk area:
+			 */
+			if (addr + len > mm->brk)
+				goto failed;
+
+			/*
+			 * Up until the brk area we randomize addresses
+			 * as much as possible:
+			 */
+			if (addr >= 0x01000000) {
+				tmp = randomize_range(0x01000000,
+					PAGE_ALIGN(max(mm->start_brk,
+					(unsigned long)0x08000000)), len);
+				vma = find_vma(mm, tmp);
+				if (TASK_SIZE - len >= tmp &&
+				    (!vma || tmp + len <= vma->vm_start))
+					return tmp;
+			}
+			/*
+			 * Ok, randomization didn't work out - return
+			 * the result of the linear search:
+			 */
+			return addr;
+		}
+		addr = vma->vm_end;
+	}
+
+failed:
+	return current->mm->get_unmapped_area(filp, addr0, len0, pgoff, flags);
+}
 
-EXPORT_SYMBOL(get_unmapped_area);
 
 /* Look up the first VMA which satisfies  addr < vm_end,  NULL if none. */
 struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
@@ -1532,6 +1624,14 @@ out:
 	return prev ? prev->vm_next : vma;
 }
 
+static int over_stack_limit(unsigned long sz)
+{
+	if (sz < EXEC_STACK_BIAS)
+		return 0;
+	return (sz - EXEC_STACK_BIAS) >
+			current->signal->rlim[RLIMIT_STACK].rlim_cur;
+}
+
 /*
  * Verify that the stack growth is acceptable and
 * update accounting. This is shared with both the
@@ -1548,7 +1648,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
 		return -ENOMEM;
 
 	/* Stack limit test */
-	if (size > rlim[RLIMIT_STACK].rlim_cur)
+	if (over_stack_limit(size))
 		return -ENOMEM;
 
 	/* mlock limit tests */
@@ -1858,10 +1958,14 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 	if (new->vm_ops && new->vm_ops->open)
 		new->vm_ops->open(new);
 
-	if (new_below)
+	if (new_below) {
+		unsigned long old_end = vma->vm_end;
+
 		vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff +
 			((addr - new->vm_start) >> PAGE_SHIFT), new);
-	else
+		if (vma->vm_flags & VM_EXEC)
+			arch_remove_exec_range(mm, old_end);
+	} else
 		vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new);
 
 	return 0;
@@ -2110,6 +2214,7 @@ void exit_mmap(struct mm_struct *mm)
 	vm_unacct_memory(nr_accounted);
 	free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0);
 	tlb_finish_mmu(tlb, 0, end);
+	arch_flush_exec_range(mm);
 
 	/*
 	 * Walk the list again, actually closing and freeing it,
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 258197b..9af91a3 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -25,9 +25,14 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
+
+#ifndef arch_remove_exec_range
+#define arch_remove_exec_range(mm, limit)	do { ; } while (0)
+#endif
+
 #ifndef pgprot_modify
 static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
 {
@@ -138,7 +143,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long oldflags = vma->vm_flags;
 	long nrpages = (end - start) >> PAGE_SHIFT;
-	unsigned long charged = 0;
+	unsigned long charged = 0, old_end = vma->vm_end;
 	pgoff_t pgoff;
 	int error;
 	int dirty_accountable = 0;
@@ -203,6 +208,9 @@ success:
 		dirty_accountable = 1;
 	}
 
+	if (oldflags & VM_EXEC)
+		arch_remove_exec_range(current->mm, old_end);
+
 	mmu_notifier_invalidate_range_start(mm, start, end);
 	if (is_vm_hugetlb_page(vma))
 		hugetlb_change_protection(vma, start, end, vma->vm_page_prot);
diff --git a/mm/mremap.c b/mm/mremap.c
index a39b7b9..6bebfde 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -400,8 +400,8 @@ unsigned long do_mremap(unsigned long addr,
 		if (vma->vm_flags & VM_MAYSHARE)
 			map_flags |= MAP_SHARED;
 
-		new_addr = get_unmapped_area(vma->vm_file, 0, new_len,
-					vma->vm_pgoff, map_flags);
+		new_addr = get_unmapped_area_prot(vma->vm_file, 0, new_len,
+					vma->vm_pgoff, map_flags,
+					vma->vm_flags & VM_EXEC);
 		if (new_addr & ~PAGE_MASK) {
 			ret = new_addr;
 			goto out;
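---

Editorial note, not part of the patch: a quick user-space smoke test for the
mechanism above. The kernel side tracks the highest PROT_EXEC mapping per mm
(context.exec_limit), truncates the user code segment descriptor to that
limit, and turns the resulting #GP into SIGSEGV once check_lazy_exec_limit()
fails to widen the CS. The sketch below assumes a 32-bit build running on
hardware without NX, with exec-shield enabled; the file name and the exact
placement of the anonymous mapping are illustrative assumptions (the
segment-limit approximation only protects addresses above the highest
executable mapping, and non-exec mappings are normally placed top-down,
above exec_limit).

/*
 * exec-shield smoke test (illustrative only, not part of the patch).
 * Build with: gcc -m32 -o nx-test nx-test.c
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	static const unsigned char ret_insn = 0xc3;	/* x86 "ret" */
	void (*func)(void);
	void *page;

	/* A writable but deliberately non-executable anonymous mapping. */
	page = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
		    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (page == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	memcpy(page, &ret_insn, sizeof(ret_insn));

	func = (void (*)(void))page;
	func();		/* expected: SIGSEGV when exec-shield (or real NX) is active */

	printf("executed code from a PROT_READ|PROT_WRITE mapping - no protection\n");
	return 0;
}

Per the sysctl and __setup() hooks added above, the feature can be toggled at
runtime through /proc/sys/kernel/exec-shield (the bitmask documented in
kernel/sysctl.c) or at boot with exec-shield=<value>; booting with noexec=off
disables it as well.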