From 1fe5620fcd6c2f0a4a927ee10c8e53196da392f3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 15 Dec 2014 14:22:46 +0100 Subject: [PATCH 01/39] isofs: Fix infinite looping over CE entries commit f54e18f1b831c92f6512d2eedb224cd63d607d3d upstream. Rock Ridge extensions define so called Continuation Entries (CE) which define where is further space with Rock Ridge data. Corrupted isofs image can contain arbitrarily long chain of these, including a one containing loop and thus causing kernel to end in an infinite loop when traversing these entries. Limit the traversal to 32 entries which should be more than enough space to store all the Rock Ridge data. Reported-by: P J P Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/isofs/rock.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index f488bbae541a..bb63254ed848 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -30,6 +30,7 @@ struct rock_state { int cont_size; int cont_extent; int cont_offset; + int cont_loops; struct inode *inode; }; @@ -73,6 +74,9 @@ static void init_rock_state(struct rock_state *rs, struct inode *inode) rs->inode = inode; } +/* Maximum number of Rock Ridge continuation entries */ +#define RR_MAX_CE_ENTRIES 32 + /* * Returns 0 if the caller should continue scanning, 1 if the scan must end * and -ve on error. @@ -105,6 +109,8 @@ static int rock_continue(struct rock_state *rs) goto out; } ret = -EIO; + if (++rs->cont_loops >= RR_MAX_CE_ENTRIES) + goto out; bh = sb_bread(rs->inode->i_sb, rs->cont_extent); if (bh) { memcpy(rs->buffer, bh->b_data + rs->cont_offset, From 359d2d755e1924fc7231d8423696a7365eccd3e1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 4 Dec 2014 16:48:16 -0800 Subject: [PATCH 02/39] x86/tls: Validate TLS entries to protect espfix commit 41bdc78544b8a93a9c6814b8bbbfef966272abbe upstream. Installing a 16-bit RW data segment into the GDT defeats espfix. AFAICT this will not affect glibc, Wine, or dosemu at all. Signed-off-by: Andy Lutomirski Acked-by: H. Peter Anvin Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Willy Tarreau Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/tls.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index f7fec09e3e3a..e7650bd71109 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -27,6 +27,21 @@ static int get_free_idx(void) return -ESRCH; } +static bool tls_desc_okay(const struct user_desc *info) +{ + if (LDT_empty(info)) + return true; + + /* + * espfix is required for 16-bit data segments, but espfix + * only works for LDT segments. + */ + if (!info->seg_32bit) + return false; + + return true; +} + static void set_tls_desc(struct task_struct *p, int idx, const struct user_desc *info, int n) { @@ -66,6 +81,9 @@ int do_set_thread_area(struct task_struct *p, int idx, if (copy_from_user(&info, u_info, sizeof(info))) return -EFAULT; + if (!tls_desc_okay(&info)) + return -EINVAL; + if (idx == -1) idx = info.entry_number; @@ -192,6 +210,7 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, { struct user_desc infobuf[GDT_ENTRY_TLS_ENTRIES]; const struct user_desc *info; + int i; if (pos >= GDT_ENTRY_TLS_ENTRIES * sizeof(struct user_desc) || (pos % sizeof(struct user_desc)) != 0 || @@ -205,6 +224,10 @@ int regset_tls_set(struct task_struct *target, const struct user_regset *regset, else info = infobuf; + for (i = 0; i < count / sizeof(struct user_desc); i++) + if (!tls_desc_okay(info + i)) + return -EINVAL; + set_tls_desc(target, GDT_ENTRY_TLS_MIN + (pos / sizeof(struct user_desc)), info, count / sizeof(struct user_desc)); From 84f538365947a5575c6c9d1404fb62386ae994ba Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 4 Dec 2014 16:48:17 -0800 Subject: [PATCH 03/39] x86/tls: Disallow unusual TLS segments commit 0e58af4e1d2166e9e33375a0f121e4867010d4f8 upstream. Users have no business installing custom code segments into the GDT, and segments that are not present but are otherwise valid are a historical source of interesting attacks. For completeness, block attempts to set the L bit. (Prior to this patch, the L bit would have been silently dropped.) This is an ABI break. I've checked glibc, musl, and Wine, and none of them look like they'll have any trouble. Note to stable maintainers: this is a hardening patch that fixes no known bugs. Given the possibility of ABI issues, this probably shouldn't be backported quickly. Signed-off-by: Andy Lutomirski Acked-by: H. Peter Anvin Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Willy Tarreau Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/tls.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index e7650bd71109..3e551eee87b9 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -39,6 +39,28 @@ static bool tls_desc_okay(const struct user_desc *info) if (!info->seg_32bit) return false; + /* Only allow data segments in the TLS array. */ + if (info->contents > 1) + return false; + + /* + * Non-present segments with DPL 3 present an interesting attack + * surface. The kernel should handle such segments correctly, + * but TLS is very difficult to protect in a sandbox, so prevent + * such segments from being created. + * + * If userspace needs to remove a TLS entry, it can still delete + * it outright. + */ + if (info->seg_not_present) + return false; + +#ifdef CONFIG_X86_64 + /* The L bit makes no sense for data. */ + if (info->lm) + return false; +#endif + return true; } From cb7977a9a8f74fa555a893c052f82a826cc66231 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 8 Dec 2014 13:55:20 -0800 Subject: [PATCH 04/39] x86_64, switch_to(): Load TLS descriptors before switching DS and ES commit f647d7c155f069c1a068030255c300663516420e upstream. Otherwise, if buggy user code points DS or ES into the TLS array, they would be corrupted after a context switch. This also significantly improves the comments and documents some gotchas in the code. Before this patch, the both tests below failed. With this patch, the es test passes, although the gsbase test still fails. ----- begin es test ----- /* * Copyright (c) 2014 Andy Lutomirski * GPL v2 */ static unsigned short GDT3(int idx) { return (idx << 3) | 3; } static int create_tls(int idx, unsigned int base) { struct user_desc desc = { .entry_number = idx, .base_addr = base, .limit = 0xfffff, .seg_32bit = 1, .contents = 0, /* Data, grow-up */ .read_exec_only = 0, .limit_in_pages = 1, .seg_not_present = 0, .useable = 0, }; if (syscall(SYS_set_thread_area, &desc) != 0) err(1, "set_thread_area"); return desc.entry_number; } int main() { int idx = create_tls(-1, 0); printf("Allocated GDT index %d\n", idx); unsigned short orig_es; asm volatile ("mov %%es,%0" : "=rm" (orig_es)); int errors = 0; int total = 1000; for (int i = 0; i < total; i++) { asm volatile ("mov %0,%%es" : : "rm" (GDT3(idx))); usleep(100); unsigned short es; asm volatile ("mov %%es,%0" : "=rm" (es)); asm volatile ("mov %0,%%es" : : "rm" (orig_es)); if (es != GDT3(idx)) { if (errors == 0) printf("[FAIL]\tES changed from 0x%hx to 0x%hx\n", GDT3(idx), es); errors++; } } if (errors) { printf("[FAIL]\tES was corrupted %d/%d times\n", errors, total); return 1; } else { printf("[OK]\tES was preserved\n"); return 0; } } ----- end es test ----- ----- begin gsbase test ----- /* * gsbase.c, a gsbase test * Copyright (c) 2014 Andy Lutomirski * GPL v2 */ static unsigned char *testptr, *testptr2; static unsigned char read_gs_testvals(void) { unsigned char ret; asm volatile ("movb %%gs:%1, %0" : "=r" (ret) : "m" (*testptr)); return ret; } int main() { int errors = 0; testptr = mmap((void *)0x200000000UL, 1, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0); if (testptr == MAP_FAILED) err(1, "mmap"); testptr2 = mmap((void *)0x300000000UL, 1, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_FIXED | MAP_ANONYMOUS, -1, 0); if (testptr2 == MAP_FAILED) err(1, "mmap"); *testptr = 0; *testptr2 = 1; if (syscall(SYS_arch_prctl, ARCH_SET_GS, (unsigned long)testptr2 - (unsigned long)testptr) != 0) err(1, "ARCH_SET_GS"); usleep(100); if (read_gs_testvals() == 1) { printf("[OK]\tARCH_SET_GS worked\n"); } else { printf("[FAIL]\tARCH_SET_GS failed\n"); errors++; } asm volatile ("mov %0,%%gs" : : "r" (0)); if (read_gs_testvals() == 0) { printf("[OK]\tWriting 0 to gs worked\n"); } else { printf("[FAIL]\tWriting 0 to gs failed\n"); errors++; } usleep(100); if (read_gs_testvals() == 0) { printf("[OK]\tgsbase is still zero\n"); } else { printf("[FAIL]\tgsbase was corrupted\n"); errors++; } return errors == 0 ? 0 : 1; } ----- end gsbase test ----- Signed-off-by: Andy Lutomirski Cc: Andi Kleen Cc: Linus Torvalds Link: http://lkml.kernel.org/r/509d27c9fec78217691c3dad91cec87e1006b34a.1418075657.git.luto@amacapital.net Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/process_64.c | 101 +++++++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index f99a242730e9..7099ab1e075b 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -279,24 +279,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) fpu = switch_fpu_prepare(prev_p, next_p, cpu); - /* - * Reload esp0, LDT and the page table pointer: - */ + /* Reload esp0 and ss1. */ load_sp0(tss, next); - /* - * Switch DS and ES. - * This won't pick up thread selector changes, but I guess that is ok. - */ - savesegment(es, prev->es); - if (unlikely(next->es | prev->es)) - loadsegment(es, next->es); - - savesegment(ds, prev->ds); - if (unlikely(next->ds | prev->ds)) - loadsegment(ds, next->ds); - - /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). * @@ -305,41 +290,101 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) savesegment(fs, fsindex); savesegment(gs, gsindex); + /* + * Load TLS before restoring any segments so that segment loads + * reference the correct GDT entries. + */ load_TLS(next, cpu); /* - * Leave lazy mode, flushing any hypercalls made here. - * This must be done before restoring TLS segments so - * the GDT and LDT are properly updated, and must be - * done before math_state_restore, so the TS bit is up - * to date. + * Leave lazy mode, flushing any hypercalls made here. This + * must be done after loading TLS entries in the GDT but before + * loading segments that might reference them, and and it must + * be done before math_state_restore, so the TS bit is up to + * date. */ arch_end_context_switch(next_p); + /* Switch DS and ES. + * + * Reading them only returns the selectors, but writing them (if + * nonzero) loads the full descriptor from the GDT or LDT. The + * LDT for next is loaded in switch_mm, and the GDT is loaded + * above. + * + * We therefore need to write new values to the segment + * registers on every context switch unless both the new and old + * values are zero. + * + * Note that we don't need to do anything for CS and SS, as + * those are saved and restored as part of pt_regs. + */ + savesegment(es, prev->es); + if (unlikely(next->es | prev->es)) + loadsegment(es, next->es); + + savesegment(ds, prev->ds); + if (unlikely(next->ds | prev->ds)) + loadsegment(ds, next->ds); + /* * Switch FS and GS. * - * Segment register != 0 always requires a reload. Also - * reload when it has changed. When prev process used 64bit - * base always reload to avoid an information leak. + * These are even more complicated than FS and GS: they have + * 64-bit bases are that controlled by arch_prctl. Those bases + * only differ from the values in the GDT or LDT if the selector + * is 0. + * + * Loading the segment register resets the hidden base part of + * the register to 0 or the value from the GDT / LDT. If the + * next base address zero, writing 0 to the segment register is + * much faster than using wrmsr to explicitly zero the base. + * + * The thread_struct.fs and thread_struct.gs values are 0 + * if the fs and gs bases respectively are not overridden + * from the values implied by fsindex and gsindex. They + * are nonzero, and store the nonzero base addresses, if + * the bases are overridden. + * + * (fs != 0 && fsindex != 0) || (gs != 0 && gsindex != 0) should + * be impossible. + * + * Therefore we need to reload the segment registers if either + * the old or new selector is nonzero, and we need to override + * the base address if next thread expects it to be overridden. + * + * This code is unnecessarily slow in the case where the old and + * new indexes are zero and the new base is nonzero -- it will + * unnecessarily write 0 to the selector before writing the new + * base address. + * + * Note: This all depends on arch_prctl being the only way that + * user code can override the segment base. Once wrfsbase and + * wrgsbase are enabled, most of this code will need to change. */ if (unlikely(fsindex | next->fsindex | prev->fs)) { loadsegment(fs, next->fsindex); + /* - * Check if the user used a selector != 0; if yes - * clear 64bit base, since overloaded base is always - * mapped to the Null selector + * If user code wrote a nonzero value to FS, then it also + * cleared the overridden base address. + * + * XXX: if user code wrote 0 to FS and cleared the base + * address itself, we won't notice and we'll incorrectly + * restore the prior base address next time we reschdule + * the process. */ if (fsindex) prev->fs = 0; } - /* when next process has a 64bit base use it */ if (next->fs) wrmsrl(MSR_FS_BASE, next->fs); prev->fsindex = fsindex; if (unlikely(gsindex | next->gsindex | prev->gs)) { load_gs_index(next->gsindex); + + /* This works (and fails) the same way as fsindex above. */ if (gsindex) prev->gs = 0; } From 9d2b6132e6963ccdfb15a4570984382425b96529 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 5 Dec 2014 19:03:28 -0800 Subject: [PATCH 05/39] x86, kvm: Clear paravirt_enabled on KVM guests for espfix32's benefit commit 29fa6825463c97e5157284db80107d1bfac5d77b upstream. paravirt_enabled has the following effects: - Disables the F00F bug workaround warning. There is no F00F bug workaround any more because Linux's standard IDT handling already works around the F00F bug, but the warning still exists. This is only cosmetic, and, in any event, there is no such thing as KVM on a CPU with the F00F bug. - Disables 32-bit APM BIOS detection. On a KVM paravirt system, there should be no APM BIOS anyway. - Disables tboot. I think that the tboot code should check the CPUID hypervisor bit directly if it matters. - paravirt_enabled disables espfix32. espfix32 should *not* be disabled under KVM paravirt. The last point is the purpose of this patch. It fixes a leak of the high 16 bits of the kernel stack address on 32-bit KVM paravirt guests. Fixes CVE-2014-8134. Suggested-by: Konrad Rzeszutek Wilk Signed-off-by: Andy Lutomirski Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/kvm.c | 9 ++++++++- arch/x86/kernel/kvmclock.c | 1 - 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index cd6d9a5a42f6..c4ff2a916139 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -279,7 +279,14 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) static void __init paravirt_ops_setup(void) { pv_info.name = "KVM"; - pv_info.paravirt_enabled = 1; + + /* + * KVM isn't paravirt in the sense of paravirt_enabled. A KVM + * guest kernel works like a bare metal kernel with additional + * features, and paravirt_enabled is about features that are + * missing. + */ + pv_info.paravirt_enabled = 0; if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY)) pv_cpu_ops.io_delay = kvm_io_delay; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 3dd37ebd591b..41514f56c241 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -265,7 +265,6 @@ void __init kvmclock_init(void) #endif kvm_get_preset_lpj(); clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); - pv_info.paravirt_enabled = 1; pv_info.name = "KVM"; if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) From 15225559b32c87614b39713875f6d799df05c9b0 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 9 Sep 2014 14:13:51 +1000 Subject: [PATCH 06/39] md/bitmap: always wait for writes on unplug. commit 4b5060ddae2b03c5387321fafc089d242225697a upstream. If two threads call bitmap_unplug at the same time, then one might schedule all the writes, and the other might decide that it doesn't need to wait. But really it does. It rarely hurts to wait when it isn't absolutely necessary, and the current code doesn't really focus on 'absolutely necessary' anyway. So just wait always. This can potentially lead to data corruption if a crash happens at an awkward time and data was written before the bitmap was updated. It is very unlikely, but this should go to -stable just to be safe. Appropriate for any -stable. Signed-off-by: NeilBrown Signed-off-by: Greg Kroah-Hartman --- drivers/md/bitmap.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 5a2c75499824..a79cbd6038f6 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -883,7 +883,6 @@ void bitmap_unplug(struct bitmap *bitmap) { unsigned long i; int dirty, need_write; - int wait = 0; if (!bitmap || !bitmap->storage.filemap || test_bit(BITMAP_STALE, &bitmap->flags)) @@ -901,16 +900,13 @@ void bitmap_unplug(struct bitmap *bitmap) clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); write_page(bitmap, bitmap->storage.filemap[i], 0); } - if (dirty) - wait = 1; - } - if (wait) { /* if any writes were performed, we need to wait on them */ - if (bitmap->storage.file) - wait_event(bitmap->write_wait, - atomic_read(&bitmap->pending_writes)==0); - else - md_super_wait(bitmap->mddev); } + if (bitmap->storage.file) + wait_event(bitmap->write_wait, + atomic_read(&bitmap->pending_writes)==0); + else + md_super_wait(bitmap->mddev); + if (test_bit(BITMAP_WRITE_ERROR, &bitmap->flags)) bitmap_file_kick(bitmap); } From 3e6845bed5e8b6b1938d509486e0fe0b953daa24 Mon Sep 17 00:00:00 2001 From: Dmitry Eremin-Solenikov Date: Fri, 24 Oct 2014 21:19:57 +0400 Subject: [PATCH 07/39] mfd: tc6393xb: Fail ohci suspend if full state restore is required commit 1a5fb99de4850cba710d91becfa2c65653048589 upstream. Some boards with TC6393XB chip require full state restore during system resume thanks to chip's VCC being cut off during suspend (Sharp SL-6000 tosa is one of them). Failing to do so would result in ohci Oops on resume due to internal memory contentes being changed. Fail ohci suspend on tc6393xb is full state restore is required. Recommended workaround is to unbind tmio-ohci driver before suspend and rebind it after resume. Signed-off-by: Dmitry Eremin-Solenikov Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- drivers/mfd/tc6393xb.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/mfd/tc6393xb.c b/drivers/mfd/tc6393xb.c index 15e1463e5e13..17fe83e81ea4 100644 --- a/drivers/mfd/tc6393xb.c +++ b/drivers/mfd/tc6393xb.c @@ -263,6 +263,17 @@ static int tc6393xb_ohci_disable(struct platform_device *dev) return 0; } +static int tc6393xb_ohci_suspend(struct platform_device *dev) +{ + struct tc6393xb_platform_data *tcpd = dev_get_platdata(dev->dev.parent); + + /* We can't properly store/restore OHCI state, so fail here */ + if (tcpd->resume_restore) + return -EBUSY; + + return tc6393xb_ohci_disable(dev); +} + static int tc6393xb_fb_enable(struct platform_device *dev) { struct tc6393xb *tc6393xb = dev_get_drvdata(dev->dev.parent); @@ -403,7 +414,7 @@ static struct mfd_cell tc6393xb_cells[] = { .num_resources = ARRAY_SIZE(tc6393xb_ohci_resources), .resources = tc6393xb_ohci_resources, .enable = tc6393xb_ohci_enable, - .suspend = tc6393xb_ohci_disable, + .suspend = tc6393xb_ohci_suspend, .resume = tc6393xb_ohci_enable, .disable = tc6393xb_ohci_disable, }, From 5d316a316386b92dc188b96ff8fc8013a0db4cc7 Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Mon, 22 Sep 2014 10:12:51 +0300 Subject: [PATCH 08/39] mmc: block: add newline to sysfs display of force_ro commit 0031a98a85e9fca282624bfc887f9531b2768396 upstream. Make force_ro consistent with other sysfs entries. Fixes: 371a689f64b0d ('mmc: MMC boot partitions support') Cc: Andrei Warkentin Signed-off-by: Baruch Siach Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/card/block.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index 9aca9462a12f..7ad66823d022 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -257,7 +257,7 @@ static ssize_t force_ro_show(struct device *dev, struct device_attribute *attr, int ret; struct mmc_blk_data *md = mmc_blk_get(dev_to_disk(dev)); - ret = snprintf(buf, PAGE_SIZE, "%d", + ret = snprintf(buf, PAGE_SIZE, "%d\n", get_disk_ro(dev_to_disk(dev)) ^ md->read_only); mmc_blk_put(md); From 72f7a5c969dd7c3f5958e14eb97ee5022fe3dba5 Mon Sep 17 00:00:00 2001 From: "Sumit.Saxena@avagotech.com" Date: Mon, 17 Nov 2014 15:24:23 +0530 Subject: [PATCH 09/39] megaraid_sas: corrected return of wait_event from abort frame path commit 170c238701ec38b1829321b17c70671c101bac55 upstream. Corrected wait_event() call which was waiting for wrong completion status (0xFF). Signed-off-by: Sumit Saxena Signed-off-by: Kashyap Desai Reviewed-by: Tomas Henzl Signed-off-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/megaraid/megaraid_sas_base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 4956c99ed90e..78b4fe845245 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -933,7 +933,7 @@ megasas_issue_blocked_abort_cmd(struct megasas_instance *instance, abort_fr->abort_mfi_phys_addr_hi = 0; cmd->sync_cmd = 1; - cmd->cmd_status = 0xFF; + cmd->cmd_status = ENODATA; instance->instancet->issue_dcmd(instance, cmd); From f30d637d028b4a5344d4651f30c486f9daec6804 Mon Sep 17 00:00:00 2001 From: Peng Tao Date: Mon, 17 Nov 2014 11:05:17 +0800 Subject: [PATCH 10/39] nfs41: fix nfs4_proc_layoutget error handling commit 4bd5a980de87d2b5af417485bde97b8eb3d6cf6a upstream. nfs4_layoutget_release() drops layout hdr refcnt. Grab the refcnt early so that it is safe to call .release in case nfs4_alloc_pages fails. Signed-off-by: Peng Tao Fixes: a47970ff78147 ("NFSv4.1: Hold reference to layout hdr in layoutget") Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/nfs4proc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 78787948f69d..20ebcfa3c92e 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6418,6 +6418,9 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) dprintk("--> %s\n", __func__); + /* nfs4_layoutget_release calls pnfs_put_layout_hdr */ + pnfs_get_layout_hdr(NFS_I(inode)->layout); + lgp->args.layout.pages = nfs4_alloc_pages(max_pages, gfp_flags); if (!lgp->args.layout.pages) { nfs4_layoutget_release(lgp); @@ -6430,9 +6433,6 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) lgp->res.seq_res.sr_slot = NULL; nfs41_init_sequence(&lgp->args.seq_args, &lgp->res.seq_res, 0); - /* nfs4_layoutget_release calls pnfs_put_layout_hdr */ - pnfs_get_layout_hdr(NFS_I(inode)->layout); - task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return ERR_CAST(task); From e1102c5b030bb0f41bed2f9120695ce0b0165c13 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Tue, 25 Nov 2014 17:45:15 -0800 Subject: [PATCH 11/39] dm bufio: fix memleak when using a dm_buffer's inline bio commit 445559cdcb98a141f5de415b94fd6eaccab87e6d upstream. When dm-bufio sets out to use the bio built into a struct dm_buffer to issue an IO, it needs to call bio_reset after it's done with the bio so that we can free things attached to the bio such as the integrity payload. Therefore, inject our own endio callback to take care of the bio_reset after calling submit_io's end_io callback. Test case: 1. modprobe scsi_debug delay=0 dif=1 dix=199 ato=1 dev_size_mb=300 2. Set up a dm-bufio client, e.g. dm-verity, on the scsi_debug device 3. Repeatedly read metadata and watch kmalloc-192 leak! Signed-off-by: Darrick J. Wong Signed-off-by: Mikulas Patocka Signed-off-by: Mike Snitzer Signed-off-by: Greg Kroah-Hartman --- drivers/md/dm-bufio.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index c9b4ca9e0696..e855a190270d 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -529,6 +529,19 @@ static void use_dmio(struct dm_buffer *b, int rw, sector_t block, end_io(&b->bio, r); } +static void inline_endio(struct bio *bio, int error) +{ + bio_end_io_t *end_fn = bio->bi_private; + + /* + * Reset the bio to free any attached resources + * (e.g. bio integrity profiles). + */ + bio_reset(bio); + + end_fn(bio, error); +} + static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, bio_end_io_t *end_io) { @@ -540,7 +553,12 @@ static void use_inline_bio(struct dm_buffer *b, int rw, sector_t block, b->bio.bi_max_vecs = DM_BUFIO_INLINE_VECS; b->bio.bi_sector = block << b->c->sectors_per_block_bits; b->bio.bi_bdev = b->c->bdev; - b->bio.bi_end_io = end_io; + b->bio.bi_end_io = inline_endio; + /* + * Use of .bi_private isn't a problem here because + * the dm_buffer's inline bio is local to bufio. + */ + b->bio.bi_private = end_io; /* * We assume that if len >= PAGE_SIZE ptr is page-aligned. From 81f250e1cf8048e4780e46d5219b0e7f943a336b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 29 Nov 2014 15:50:21 +0300 Subject: [PATCH 12/39] dm space map metadata: fix sm_bootstrap_get_nr_blocks() commit c1c6156fe4d4577444b769d7edd5dd503e57bbc9 upstream. This function isn't right and it causes a static checker warning: drivers/md/dm-thin.c:3016 maybe_resize_data_dev() error: potentially using uninitialized 'sb_data_size'. It should set "*count" and return zero on success the same as the sm_metadata_get_nr_blocks() function does earlier. Fixes: 3241b1d3e0aa ('dm: add persistent data library') Signed-off-by: Dan Carpenter Acked-by: Joe Thornber Signed-off-by: Mike Snitzer Signed-off-by: Greg Kroah-Hartman --- drivers/md/persistent-data/dm-space-map-metadata.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c index afb419e514bf..056d09c33af1 100644 --- a/drivers/md/persistent-data/dm-space-map-metadata.c +++ b/drivers/md/persistent-data/dm-space-map-metadata.c @@ -493,7 +493,9 @@ static int sm_bootstrap_get_nr_blocks(struct dm_space_map *sm, dm_block_t *count { struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); - return smm->ll.nr_blocks; + *count = smm->ll.nr_blocks; + + return 0; } static int sm_bootstrap_get_nr_free(struct dm_space_map *sm, dm_block_t *count) From d011d586b3f9f1b08f3deb080d8542f56963dc28 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 17 Dec 2014 14:48:30 -0800 Subject: [PATCH 13/39] x86/tls: Don't validate lm in set_thread_area() after all commit 3fb2f4237bb452eb4e98f6a5dbd5a445b4fed9d0 upstream. It turns out that there's a lurking ABI issue. GCC, when compiling this in a 32-bit program: struct user_desc desc = { .entry_number = idx, .base_addr = base, .limit = 0xfffff, .seg_32bit = 1, .contents = 0, /* Data, grow-up */ .read_exec_only = 0, .limit_in_pages = 1, .seg_not_present = 0, .useable = 0, }; will leave .lm uninitialized. This means that anything in the kernel that reads user_desc.lm for 32-bit tasks is unreliable. Revert the .lm check in set_thread_area(). The value never did anything in the first place. Fixes: 0e58af4e1d21 ("x86/tls: Disallow unusual TLS segments") Signed-off-by: Andy Lutomirski Acked-by: Thomas Gleixner Cc: Linus Torvalds Link: http://lkml.kernel.org/r/d7875b60e28c512f6a6fc0baf5714d58e7eaadbb.1418856405.git.luto@amacapital.net Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/uapi/asm/ldt.h | 7 +++++++ arch/x86/kernel/tls.c | 6 ------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/uapi/asm/ldt.h b/arch/x86/include/uapi/asm/ldt.h index 46727eb37bfe..6e1aaf73852a 100644 --- a/arch/x86/include/uapi/asm/ldt.h +++ b/arch/x86/include/uapi/asm/ldt.h @@ -28,6 +28,13 @@ struct user_desc { unsigned int seg_not_present:1; unsigned int useable:1; #ifdef __x86_64__ + /* + * Because this bit is not present in 32-bit user code, user + * programs can pass uninitialized values here. Therefore, in + * any context in which a user_desc comes from a 32-bit program, + * the kernel must act as though lm == 0, regardless of the + * actual value. + */ unsigned int lm:1; #endif }; diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index 3e551eee87b9..4e942f31b1a7 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -55,12 +55,6 @@ static bool tls_desc_okay(const struct user_desc *info) if (info->seg_not_present) return false; -#ifdef CONFIG_X86_64 - /* The L bit makes no sense for data. */ - if (info->lm) - return false; -#endif - return true; } From 684f4c093f182756a1c1f582c415d3120cc7f5e8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 18 Dec 2014 17:26:10 +0100 Subject: [PATCH 14/39] isofs: Fix unchecked printing of ER records commit 4e2024624e678f0ebb916e6192bd23c1f9fdf696 upstream. We didn't check length of rock ridge ER records before printing them. Thus corrupted isofs image can cause us to access and print some memory behind the buffer with obvious consequences. Reported-and-tested-by: Carl Henrik Lunde Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/isofs/rock.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index bb63254ed848..735d7522a3a9 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -362,6 +362,9 @@ repeat: rs.cont_size = isonum_733(rr->u.CE.size); break; case SIG('E', 'R'): + /* Invalid length of ER tag id? */ + if (rr->u.ER.len_id + offsetof(struct rock_ridge, u.ER.data) > rr->len) + goto out; ISOFS_SB(inode->i_sb)->s_rock = 1; printk(KERN_DEBUG "ISO 9660 Extensions: "); { From 18b41bd86e6cdcdc3b4cef920a1f3d6f496634e3 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 4 Dec 2014 18:25:19 +0100 Subject: [PATCH 15/39] KEYS: Fix stale key registration at error path commit b26bdde5bb27f3f900e25a95e33a0c476c8c2c48 upstream. When loading encrypted-keys module, if the last check of aes_get_sizes() in init_encrypted() fails, the driver just returns an error without unregistering its key type. This results in the stale entry in the list. In addition to memory leaks, this leads to a kernel crash when registering a new key type later. This patch fixes the problem by swapping the calls of aes_get_sizes() and register_key_type(), and releasing resources properly at the error paths. Bugzilla: https://bugzilla.opensuse.org/show_bug.cgi?id=908163 Signed-off-by: Takashi Iwai Signed-off-by: Mimi Zohar Signed-off-by: Greg Kroah-Hartman --- security/keys/encrypted-keys/encrypted.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/security/keys/encrypted-keys/encrypted.c b/security/keys/encrypted-keys/encrypted.c index 9e1e005c7596..c4c8df4b214d 100644 --- a/security/keys/encrypted-keys/encrypted.c +++ b/security/keys/encrypted-keys/encrypted.c @@ -1018,10 +1018,13 @@ static int __init init_encrypted(void) ret = encrypted_shash_alloc(); if (ret < 0) return ret; + ret = aes_get_sizes(); + if (ret < 0) + goto out; ret = register_key_type(&key_type_encrypted); if (ret < 0) goto out; - return aes_get_sizes(); + return 0; out: encrypted_shash_release(); return ret; From 47e60160ef39bea01a3eb5d3aa1da945400fa76c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20M=C3=BCller?= Date: Fri, 12 Dec 2014 12:11:11 +0100 Subject: [PATCH 16/39] mac80211: fix multicast LED blinking and counter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d025933e29872cb1fe19fc54d80e4dfa4ee5779c upstream. As multicast-frames can't be fragmented, "dot11MulticastReceivedFrameCount" stopped being incremented after the use-after-free fix. Furthermore, the RX-LED will be triggered by every multicast frame (which wouldn't happen before) which wouldn't allow the LED to rest at all. Fixes https://bugzilla.kernel.org/show_bug.cgi?id=89431 which also had the patch. Fixes: b8fff407a180 ("mac80211: fix use-after-free in defragmentation") Signed-off-by: Andreas Müller [rewrite commit message] Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/mac80211/rx.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 85bc6d498b46..9299a38c372e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1585,14 +1585,14 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) sc = le16_to_cpu(hdr->seq_ctrl); frag = sc & IEEE80211_SCTL_FRAG; - if (likely(!ieee80211_has_morefrags(fc) && frag == 0)) - goto out; - if (is_multicast_ether_addr(hdr->addr1)) { rx->local->dot11MulticastReceivedFrameCount++; - goto out; + goto out_no_led; } + if (likely(!ieee80211_has_morefrags(fc) && frag == 0)) + goto out; + I802_DEBUG_INC(rx->local->rx_handlers_fragments); if (skb_linearize(rx->skb)) @@ -1683,9 +1683,10 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) status->rx_flags |= IEEE80211_RX_FRAGMENTED; out: + ieee80211_led_rx(rx->local); + out_no_led: if (rx->sta) rx->sta->rx_packets++; - ieee80211_led_rx(rx->local); return RX_CONTINUE; } From 5f37daa4cbd955afb1eba2909bf24daea20fd37c Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 17 Dec 2014 13:55:49 +0100 Subject: [PATCH 17/39] mac80211: free management frame keys when removing station commit 28a9bc68124c319b2b3dc861e80828a8865fd1ba upstream. When writing the code to allow per-station GTKs, I neglected to take into account the management frame keys (index 4 and 5) when freeing the station and only added code to free the first four data frame keys. Fix this by iterating the array of keys over the right length. Fixes: e31b82136d1a ("cfg80211/mac80211: allow per-station GTKs") Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/mac80211/key.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 67059b88fea5..635d0972b688 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -607,7 +607,7 @@ void ieee80211_free_sta_keys(struct ieee80211_local *local, int i; mutex_lock(&local->key_mtx); - for (i = 0; i < NUM_DEFAULT_KEYS; i++) { + for (i = 0; i < ARRAY_SIZE(sta->gtk); i++) { key = key_mtx_dereference(local, sta->gtk[i]); if (!key) continue; From 4be461b16020beb18a0cd680ec74ba63f83b677c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 13 Aug 2014 01:33:38 -0700 Subject: [PATCH 18/39] mnt: Implicitly add MNT_NODEV on remount when it was implicitly added by mount commit 3e1866410f11356a9fd869beb3e95983dc79c067 upstream. Now that remount is properly enforcing the rule that you can't remove nodev at least sandstorm.io is breaking when performing a remount. It turns out that there is an easy intuitive solution implicitly add nodev on remount when nodev was implicitly added on mount. Tested-by: Cedric Bosdonnat Tested-by: Richard Weinberger Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- fs/namespace.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 154822397780..fb2d1ad022bf 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1816,7 +1816,13 @@ static int do_remount(struct path *path, int flags, int mnt_flags, } if ((mnt->mnt.mnt_flags & MNT_LOCK_NODEV) && !(mnt_flags & MNT_NODEV)) { - return -EPERM; + /* Was the nodev implicitly added in mount? */ + if ((mnt->mnt_ns->user_ns != &init_user_ns) && + !(sb->s_type->fs_flags & FS_USERNS_DEV_MOUNT)) { + mnt_flags |= MNT_NODEV; + } else { + return -EPERM; + } } if ((mnt->mnt.mnt_flags & MNT_LOCK_NOSUID) && !(mnt_flags & MNT_NOSUID)) { From 260cb8f431389643881f136d62838334de5fc327 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 22 Aug 2014 16:39:03 -0500 Subject: [PATCH 19/39] mnt: Update unprivileged remount test commit 4a44a19b470a886997d6647a77bb3e38dcbfa8c5 upstream. - MNT_NODEV should be irrelevant except when reading back mount flags, no longer specify MNT_NODEV on remount. - Test MNT_NODEV on devpts where it is meaningful even for unprivileged mounts. - Add a test to verify that remount of a prexisting mount with the same flags is allowed and does not change those flags. - Cleanup up the definitions of MS_REC, MS_RELATIME, MS_STRICTATIME that are used when the code is built in an environment without them. - Correct the test error messages when tests fail. There were not 5 tests that tested MS_RELATIME. Signed-off-by: Eric W. Biederman Signed-off-by: Greg Kroah-Hartman --- .../mount/unprivileged-remount-test.c | 172 +++++++++++++++--- 1 file changed, 142 insertions(+), 30 deletions(-) diff --git a/tools/testing/selftests/mount/unprivileged-remount-test.c b/tools/testing/selftests/mount/unprivileged-remount-test.c index 1b3ff2fda4d0..9669d375625a 100644 --- a/tools/testing/selftests/mount/unprivileged-remount-test.c +++ b/tools/testing/selftests/mount/unprivileged-remount-test.c @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include #include @@ -32,11 +34,14 @@ # define CLONE_NEWPID 0x20000000 #endif +#ifndef MS_REC +# define MS_REC 16384 +#endif #ifndef MS_RELATIME -#define MS_RELATIME (1 << 21) +# define MS_RELATIME (1 << 21) #endif #ifndef MS_STRICTATIME -#define MS_STRICTATIME (1 << 24) +# define MS_STRICTATIME (1 << 24) #endif static void die(char *fmt, ...) @@ -87,6 +92,45 @@ static void write_file(char *filename, char *fmt, ...) } } +static int read_mnt_flags(const char *path) +{ + int ret; + struct statvfs stat; + int mnt_flags; + + ret = statvfs(path, &stat); + if (ret != 0) { + die("statvfs of %s failed: %s\n", + path, strerror(errno)); + } + if (stat.f_flag & ~(ST_RDONLY | ST_NOSUID | ST_NODEV | \ + ST_NOEXEC | ST_NOATIME | ST_NODIRATIME | ST_RELATIME | \ + ST_SYNCHRONOUS | ST_MANDLOCK)) { + die("Unrecognized mount flags\n"); + } + mnt_flags = 0; + if (stat.f_flag & ST_RDONLY) + mnt_flags |= MS_RDONLY; + if (stat.f_flag & ST_NOSUID) + mnt_flags |= MS_NOSUID; + if (stat.f_flag & ST_NODEV) + mnt_flags |= MS_NODEV; + if (stat.f_flag & ST_NOEXEC) + mnt_flags |= MS_NOEXEC; + if (stat.f_flag & ST_NOATIME) + mnt_flags |= MS_NOATIME; + if (stat.f_flag & ST_NODIRATIME) + mnt_flags |= MS_NODIRATIME; + if (stat.f_flag & ST_RELATIME) + mnt_flags |= MS_RELATIME; + if (stat.f_flag & ST_SYNCHRONOUS) + mnt_flags |= MS_SYNCHRONOUS; + if (stat.f_flag & ST_MANDLOCK) + mnt_flags |= ST_MANDLOCK; + + return mnt_flags; +} + static void create_and_enter_userns(void) { uid_t uid; @@ -118,7 +162,8 @@ static void create_and_enter_userns(void) } static -bool test_unpriv_remount(int mount_flags, int remount_flags, int invalid_flags) +bool test_unpriv_remount(const char *fstype, const char *mount_options, + int mount_flags, int remount_flags, int invalid_flags) { pid_t child; @@ -151,9 +196,11 @@ bool test_unpriv_remount(int mount_flags, int remount_flags, int invalid_flags) strerror(errno)); } - if (mount("testing", "/tmp", "ramfs", mount_flags, NULL) != 0) { - die("mount of /tmp failed: %s\n", - strerror(errno)); + if (mount("testing", "/tmp", fstype, mount_flags, mount_options) != 0) { + die("mount of %s with options '%s' on /tmp failed: %s\n", + fstype, + mount_options? mount_options : "", + strerror(errno)); } create_and_enter_userns(); @@ -181,62 +228,127 @@ bool test_unpriv_remount(int mount_flags, int remount_flags, int invalid_flags) static bool test_unpriv_remount_simple(int mount_flags) { - return test_unpriv_remount(mount_flags, mount_flags, 0); + return test_unpriv_remount("ramfs", NULL, mount_flags, mount_flags, 0); } static bool test_unpriv_remount_atime(int mount_flags, int invalid_flags) { - return test_unpriv_remount(mount_flags, mount_flags, invalid_flags); + return test_unpriv_remount("ramfs", NULL, mount_flags, mount_flags, + invalid_flags); +} + +static bool test_priv_mount_unpriv_remount(void) +{ + pid_t child; + int ret; + const char *orig_path = "/dev"; + const char *dest_path = "/tmp"; + int orig_mnt_flags, remount_mnt_flags; + + child = fork(); + if (child == -1) { + die("fork failed: %s\n", + strerror(errno)); + } + if (child != 0) { /* parent */ + pid_t pid; + int status; + pid = waitpid(child, &status, 0); + if (pid == -1) { + die("waitpid failed: %s\n", + strerror(errno)); + } + if (pid != child) { + die("waited for %d got %d\n", + child, pid); + } + if (!WIFEXITED(status)) { + die("child did not terminate cleanly\n"); + } + return WEXITSTATUS(status) == EXIT_SUCCESS ? true : false; + } + + orig_mnt_flags = read_mnt_flags(orig_path); + + create_and_enter_userns(); + ret = unshare(CLONE_NEWNS); + if (ret != 0) { + die("unshare(CLONE_NEWNS) failed: %s\n", + strerror(errno)); + } + + ret = mount(orig_path, dest_path, "bind", MS_BIND | MS_REC, NULL); + if (ret != 0) { + die("recursive bind mount of %s onto %s failed: %s\n", + orig_path, dest_path, strerror(errno)); + } + + ret = mount(dest_path, dest_path, "none", + MS_REMOUNT | MS_BIND | orig_mnt_flags , NULL); + if (ret != 0) { + /* system("cat /proc/self/mounts"); */ + die("remount of /tmp failed: %s\n", + strerror(errno)); + } + + remount_mnt_flags = read_mnt_flags(dest_path); + if (orig_mnt_flags != remount_mnt_flags) { + die("Mount flags unexpectedly changed during remount of %s originally mounted on %s\n", + dest_path, orig_path); + } + exit(EXIT_SUCCESS); } int main(int argc, char **argv) { - if (!test_unpriv_remount_simple(MS_RDONLY|MS_NODEV)) { + if (!test_unpriv_remount_simple(MS_RDONLY)) { die("MS_RDONLY malfunctions\n"); } - if (!test_unpriv_remount_simple(MS_NODEV)) { + if (!test_unpriv_remount("devpts", "newinstance", MS_NODEV, MS_NODEV, 0)) { die("MS_NODEV malfunctions\n"); } - if (!test_unpriv_remount_simple(MS_NOSUID|MS_NODEV)) { + if (!test_unpriv_remount_simple(MS_NOSUID)) { die("MS_NOSUID malfunctions\n"); } - if (!test_unpriv_remount_simple(MS_NOEXEC|MS_NODEV)) { + if (!test_unpriv_remount_simple(MS_NOEXEC)) { die("MS_NOEXEC malfunctions\n"); } - if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODEV, - MS_NOATIME|MS_NODEV)) + if (!test_unpriv_remount_atime(MS_RELATIME, + MS_NOATIME)) { die("MS_RELATIME malfunctions\n"); } - if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODEV, - MS_NOATIME|MS_NODEV)) + if (!test_unpriv_remount_atime(MS_STRICTATIME, + MS_NOATIME)) { die("MS_STRICTATIME malfunctions\n"); } - if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODEV, - MS_STRICTATIME|MS_NODEV)) + if (!test_unpriv_remount_atime(MS_NOATIME, + MS_STRICTATIME)) { - die("MS_RELATIME malfunctions\n"); + die("MS_NOATIME malfunctions\n"); } - if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODIRATIME|MS_NODEV, - MS_NOATIME|MS_NODEV)) + if (!test_unpriv_remount_atime(MS_RELATIME|MS_NODIRATIME, + MS_NOATIME)) { - die("MS_RELATIME malfunctions\n"); + die("MS_RELATIME|MS_NODIRATIME malfunctions\n"); } - if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODIRATIME|MS_NODEV, - MS_NOATIME|MS_NODEV)) + if (!test_unpriv_remount_atime(MS_STRICTATIME|MS_NODIRATIME, + MS_NOATIME)) { - die("MS_RELATIME malfunctions\n"); + die("MS_STRICTATIME|MS_NODIRATIME malfunctions\n"); } - if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODIRATIME|MS_NODEV, - MS_STRICTATIME|MS_NODEV)) + if (!test_unpriv_remount_atime(MS_NOATIME|MS_NODIRATIME, + MS_STRICTATIME)) { - die("MS_RELATIME malfunctions\n"); + die("MS_NOATIME|MS_DIRATIME malfunctions\n"); } - if (!test_unpriv_remount(MS_STRICTATIME|MS_NODEV, MS_NODEV, - MS_NOATIME|MS_NODEV)) + if (!test_unpriv_remount("ramfs", NULL, MS_STRICTATIME, 0, MS_NOATIME)) { die("Default atime malfunctions\n"); } + if (!test_priv_mount_unpriv_remount()) { + die("Mount flags unexpectedly changed after remount\n"); + } return EXIT_SUCCESS; } From c65d3b05d20c15f4421f853cbd2d41b91a12185e Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 4 Oct 2014 14:44:03 -0700 Subject: [PATCH 20/39] umount: Disallow unprivileged mount force commit b2f5d4dc38e034eecb7987e513255265ff9aa1cf upstream. Forced unmount affects not just the mount namespace but the underlying superblock as well. Restrict forced unmount to the global root user for now. Otherwise it becomes possible a user in a less privileged mount namespace to force the shutdown of a superblock of a filesystem in a more privileged mount namespace, allowing a DOS attack on root. Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- fs/namespace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index fb2d1ad022bf..d0244c8ba09c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1342,6 +1342,9 @@ SYSCALL_DEFINE2(umount, char __user *, name, int, flags) goto dput_and_out; if (!check_mnt(mnt)) goto dput_and_out; + retval = -EPERM; + if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN)) + goto dput_and_out; retval = do_umount(mnt, flags); dput_and_out: From 4accc8c8e2dbb1f5ff3d8836244a9423bd5e11a9 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 5 Dec 2014 17:19:27 -0600 Subject: [PATCH 21/39] groups: Consolidate the setgroups permission checks commit 7ff4d90b4c24a03666f296c3d4878cd39001e81e upstream. Today there are 3 instances of setgroups and due to an oversight their permission checking has diverged. Add a common function so that they may all share the same permission checking code. This corrects the current oversight in the current permission checks and adds a helper to avoid this in the future. A user namespace security fix will update this new helper, shortly. Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- arch/s390/kernel/compat_linux.c | 2 +- include/linux/cred.h | 1 + kernel/groups.c | 9 ++++++++- kernel/uid16.c | 2 +- 4 files changed, 11 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 8b6e4f5288a2..a98afed9348b 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -248,7 +248,7 @@ asmlinkage long sys32_setgroups16(int gidsetsize, u16 __user *grouplist) struct group_info *group_info; int retval; - if (!capable(CAP_SETGID)) + if (!may_setgroups()) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/include/linux/cred.h b/include/linux/cred.h index 04421e825365..6c58dd7cb9ac 100644 --- a/include/linux/cred.h +++ b/include/linux/cred.h @@ -68,6 +68,7 @@ extern void groups_free(struct group_info *); extern int set_current_groups(struct group_info *); extern int set_groups(struct cred *, struct group_info *); extern int groups_search(const struct group_info *, kgid_t); +extern bool may_setgroups(void); /* access the groups "array" with this macro */ #define GROUP_AT(gi, i) \ diff --git a/kernel/groups.c b/kernel/groups.c index 6b2588dd04ff..984bb629c68c 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -223,6 +223,13 @@ out: return i; } +bool may_setgroups(void) +{ + struct user_namespace *user_ns = current_user_ns(); + + return ns_capable(user_ns, CAP_SETGID); +} + /* * SMP: Our groups are copy-on-write. We can set them safely * without another task interfering. @@ -233,7 +240,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!nsown_capable(CAP_SETGID)) + if (!may_setgroups()) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; diff --git a/kernel/uid16.c b/kernel/uid16.c index f6c83d7ef000..d58cc4d8f0d1 100644 --- a/kernel/uid16.c +++ b/kernel/uid16.c @@ -176,7 +176,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist) struct group_info *group_info; int retval; - if (!nsown_capable(CAP_SETGID)) + if (!may_setgroups()) return -EPERM; if ((unsigned)gidsetsize > NGROUPS_MAX) return -EINVAL; From b8a0441b542f6d6bd6fda46cc735ae71392cb845 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 5 Dec 2014 17:51:47 -0600 Subject: [PATCH 22/39] userns: Document what the invariant required for safe unprivileged mappings. commit 0542f17bf2c1f2430d368f44c8fcf2f82ec9e53e upstream. The rule is simple. Don't allow anything that wouldn't be allowed without unprivileged mappings. It was previously overlooked that establishing gid mappings would allow dropping groups and potentially gaining permission to files and directories that had lesser permissions for a specific group than for all other users. This is the rule needed to fix CVE-2014-8989 and prevent any other security issues with new_idmap_permitted. The reason for this rule is that the unix permission model is old and there are programs out there somewhere that take advantage of every little corner of it. So allowing a uid or gid mapping to be established without privielge that would allow anything that would not be allowed without that mapping will result in expectations from some code somewhere being violated. Violated expectations about the behavior of the OS is a long way to say a security issue. Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- kernel/user_namespace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9bea1d7dd21f..59790206ca2e 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -800,7 +800,9 @@ static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { - /* Allow mapping to your own filesystem ids */ + /* Don't allow mappings that would allow anything that wouldn't + * be allowed without the establishment of unprivileged mappings. + */ if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { From fc9b65e3d7703e6d63875b0b233bbe26a4a513ba Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 5 Dec 2014 18:01:11 -0600 Subject: [PATCH 23/39] userns: Don't allow setgroups until a gid mapping has been setablished commit 273d2c67c3e179adb1e74f403d1e9a06e3f841b5 upstream. setgroups is unique in not needing a valid mapping before it can be called, in the case of setgroups(0, NULL) which drops all supplemental groups. The design of the user namespace assumes that CAP_SETGID can not actually be used until a gid mapping is established. Therefore add a helper function to see if the user namespace gid mapping has been established and call that function in the setgroups permission check. This is part of the fix for CVE-2014-8989, being able to drop groups without privilege using user namespaces. Reviewed-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- include/linux/user_namespace.h | 5 +++++ kernel/groups.c | 4 +++- kernel/user_namespace.c | 14 ++++++++++++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 14105c26a836..648cf88c5d6d 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -59,6 +59,7 @@ extern struct seq_operations proc_projid_seq_operations; extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); +extern bool userns_may_setgroups(const struct user_namespace *ns); #else static inline struct user_namespace *get_user_ns(struct user_namespace *ns) @@ -83,6 +84,10 @@ static inline void put_user_ns(struct user_namespace *ns) { } +static inline bool userns_may_setgroups(const struct user_namespace *ns) +{ + return true; +} #endif void update_mnt_policy(struct user_namespace *userns); diff --git a/kernel/groups.c b/kernel/groups.c index 984bb629c68c..67b4ba30475f 100644 --- a/kernel/groups.c +++ b/kernel/groups.c @@ -6,6 +6,7 @@ #include #include #include +#include #include /* init to 2 - one for init_task, one to ensure it is never freed */ @@ -227,7 +228,8 @@ bool may_setgroups(void) { struct user_namespace *user_ns = current_user_ns(); - return ns_capable(user_ns, CAP_SETGID); + return ns_capable(user_ns, CAP_SETGID) && + userns_may_setgroups(user_ns); } /* diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 59790206ca2e..b083fa233daa 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -832,6 +832,20 @@ static bool new_idmap_permitted(const struct file *file, return false; } +bool userns_may_setgroups(const struct user_namespace *ns) +{ + bool allowed; + + mutex_lock(&id_map_mutex); + /* It is not safe to use setgroups until a gid mapping in + * the user namespace has been established. + */ + allowed = ns->gid_map.nr_extents != 0; + mutex_unlock(&id_map_mutex); + + return allowed; +} + static void *userns_get(struct task_struct *task) { struct user_namespace *user_ns; From ba0922adbd2ccffe444608298ae0506401eac4c3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 5 Dec 2014 18:14:19 -0600 Subject: [PATCH 24/39] userns: Don't allow unprivileged creation of gid mappings commit be7c6dba2332cef0677fbabb606e279ae76652c3 upstream. As any gid mapping will allow and must allow for backwards compatibility dropping groups don't allow any gid mappings to be established without CAP_SETGID in the parent user namespace. For a small class of applications this change breaks userspace and removes useful functionality. This small class of applications includes tools/testing/selftests/mount/unprivilged-remount-test.c Most of the removed functionality will be added back with the addition of a one way knob to disable setgroups. Once setgroups is disabled setting the gid_map becomes as safe as setting the uid_map. For more common applications that set the uid_map and the gid_map with privilege this change will have no affect. This is part of a fix for CVE-2014-8989. Reviewed-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- kernel/user_namespace.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index b083fa233daa..075020fa22fd 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -810,11 +810,6 @@ static bool new_idmap_permitted(const struct file *file, if (uid_eq(uid, file->f_cred->fsuid)) return true; } - else if (cap_setid == CAP_SETGID) { - kgid_t gid = make_kgid(ns->parent, id); - if (gid_eq(gid, file->f_cred->fsgid)) - return true; - } } /* Allow anyone to set a mapping that doesn't require privilege */ From f028f2d73293b65a5e58ee7468a8683b39fd912c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 5 Dec 2014 18:26:30 -0600 Subject: [PATCH 25/39] userns: Check euid no fsuid when establishing an unprivileged uid mapping commit 80dd00a23784b384ccea049bfb3f259d3f973b9d upstream. setresuid allows the euid to be set to any of uid, euid, suid, and fsuid. Therefor it is safe to allow an unprivileged user to map their euid and use CAP_SETUID privileged with exactly that uid, as no new credentials can be obtained. I can not find a combination of existing system calls that allows setting uid, euid, suid, and fsuid from the fsuid making the previous use of fsuid for allowing unprivileged mappings a bug. This is part of a fix for CVE-2014-8989. Reviewed-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- kernel/user_namespace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 075020fa22fd..592ab70df216 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -807,7 +807,7 @@ static bool new_idmap_permitted(const struct file *file, u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); - if (uid_eq(uid, file->f_cred->fsuid)) + if (uid_eq(uid, file->f_cred->euid)) return true; } } From a1821391e5072029118857f6ebb27f3cf66b9f33 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 26 Nov 2014 23:22:14 -0600 Subject: [PATCH 26/39] userns: Only allow the creator of the userns unprivileged mappings commit f95d7918bd1e724675de4940039f2865e5eec5fe upstream. If you did not create the user namespace and are allowed to write to uid_map or gid_map you should already have the necessary privilege in the parent user namespace to establish any mapping you want so this will not affect userspace in practice. Limiting unprivileged uid mapping establishment to the creator of the user namespace makes it easier to verify all credentials obtained with the uid mapping can be obtained without the uid mapping without privilege. Limiting unprivileged gid mapping establishment (which is temporarily absent) to the creator of the user namespace also ensures that the combination of uid and gid can already be obtained without privilege. This is part of the fix for CVE-2014-8989. Reviewed-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- kernel/user_namespace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 592ab70df216..927d4ea4cd0b 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -800,14 +800,16 @@ static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { + const struct cred *cred = file->f_cred; /* Don't allow mappings that would allow anything that wouldn't * be allowed without the establishment of unprivileged mappings. */ - if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1)) { + if ((new_map->nr_extents == 1) && (new_map->extent[0].count == 1) && + uid_eq(ns->owner, cred->euid)) { u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); - if (uid_eq(uid, file->f_cred->euid)) + if (uid_eq(uid, cred->euid)) return true; } } From 6eaab78729f4e689a86ce6bc4e33453ebff9625d Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 9 Dec 2014 14:03:14 -0600 Subject: [PATCH 27/39] userns: Rename id_map_mutex to userns_state_mutex commit f0d62aec931e4ae3333c797d346dc4f188f454ba upstream. Generalize id_map_mutex so it can be used for more state of a user namespace. Reviewed-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- kernel/user_namespace.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 927d4ea4cd0b..78009bd5efe6 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -24,6 +24,7 @@ #include static struct kmem_cache *user_ns_cachep __read_mostly; +static DEFINE_MUTEX(userns_state_mutex); static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, @@ -577,9 +578,6 @@ static bool mappings_overlap(struct uid_gid_map *new_map, struct uid_gid_extent return false; } - -static DEFINE_MUTEX(id_map_mutex); - static ssize_t map_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos, int cap_setid, @@ -596,7 +594,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ssize_t ret = -EINVAL; /* - * The id_map_mutex serializes all writes to any given map. + * The userns_state_mutex serializes all writes to any given map. * * Any map is only ever written once. * @@ -614,7 +612,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, * order and smp_rmb() is guaranteed that we don't have crazy * architectures returning stale data. */ - mutex_lock(&id_map_mutex); + mutex_lock(&userns_state_mutex); ret = -EPERM; /* Only allow one successful write to the map */ @@ -741,7 +739,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, *ppos = count; ret = count; out: - mutex_unlock(&id_map_mutex); + mutex_unlock(&userns_state_mutex); if (page) free_page(page); return ret; @@ -833,12 +831,12 @@ bool userns_may_setgroups(const struct user_namespace *ns) { bool allowed; - mutex_lock(&id_map_mutex); + mutex_lock(&userns_state_mutex); /* It is not safe to use setgroups until a gid mapping in * the user namespace has been established. */ allowed = ns->gid_map.nr_extents != 0; - mutex_unlock(&id_map_mutex); + mutex_unlock(&userns_state_mutex); return allowed; } From 1c587ee50ec44a2cf07cd160c214a683608b4d2c Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 2 Dec 2014 12:27:26 -0600 Subject: [PATCH 28/39] userns: Add a knob to disable setgroups on a per user namespace basis commit 9cc46516ddf497ea16e8d7cb986ae03a0f6b92f8 upstream. - Expose the knob to user space through a proc file /proc//setgroups A value of "deny" means the setgroups system call is disabled in the current processes user namespace and can not be enabled in the future in this user namespace. A value of "allow" means the segtoups system call is enabled. - Descendant user namespaces inherit the value of setgroups from their parents. - A proc file is used (instead of a sysctl) as sysctls currently do not allow checking the permissions at open time. - Writing to the proc file is restricted to before the gid_map for the user namespace is set. This ensures that disabling setgroups at a user namespace level will never remove the ability to call setgroups from a process that already has that ability. A process may opt in to the setgroups disable for itself by creating, entering and configuring a user namespace or by calling setns on an existing user namespace with setgroups disabled. Processes without privileges already can not call setgroups so this is a noop. Prodcess with privilege become processes without privilege when entering a user namespace and as with any other path to dropping privilege they would not have the ability to call setgroups. So this remains within the bounds of what is possible without a knob to disable setgroups permanently in a user namespace. Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- fs/proc/base.c | 53 +++++++++++++++++++++ include/linux/user_namespace.h | 7 +++ kernel/user.c | 1 + kernel/user_namespace.c | 85 ++++++++++++++++++++++++++++++++++ 4 files changed, 146 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index de12b8128b95..8fc784aef0b8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2612,6 +2612,57 @@ static const struct file_operations proc_projid_map_operations = { .llseek = seq_lseek, .release = proc_id_map_release, }; + +static int proc_setgroups_open(struct inode *inode, struct file *file) +{ + struct user_namespace *ns = NULL; + struct task_struct *task; + int ret; + + ret = -ESRCH; + task = get_proc_task(inode); + if (task) { + rcu_read_lock(); + ns = get_user_ns(task_cred_xxx(task, user_ns)); + rcu_read_unlock(); + put_task_struct(task); + } + if (!ns) + goto err; + + if (file->f_mode & FMODE_WRITE) { + ret = -EACCES; + if (!ns_capable(ns, CAP_SYS_ADMIN)) + goto err_put_ns; + } + + ret = single_open(file, &proc_setgroups_show, ns); + if (ret) + goto err_put_ns; + + return 0; +err_put_ns: + put_user_ns(ns); +err: + return ret; +} + +static int proc_setgroups_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + int ret = single_release(inode, file); + put_user_ns(ns); + return ret; +} + +static const struct file_operations proc_setgroups_operations = { + .open = proc_setgroups_open, + .write = proc_setgroups_write, + .read = seq_read, + .llseek = seq_lseek, + .release = proc_setgroups_release, +}; #endif /* CONFIG_USER_NS */ static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, @@ -2720,6 +2771,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), + REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif #ifdef CONFIG_CHECKPOINT_RESTORE REG("timers", S_IRUGO, proc_timers_operations), @@ -3073,6 +3125,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("uid_map", S_IRUGO|S_IWUSR, proc_uid_map_operations), REG("gid_map", S_IRUGO|S_IWUSR, proc_gid_map_operations), REG("projid_map", S_IRUGO|S_IWUSR, proc_projid_map_operations), + REG("setgroups", S_IRUGO|S_IWUSR, proc_setgroups_operations), #endif }; diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 648cf88c5d6d..a37081cf59da 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -17,6 +17,10 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */ } extent[UID_GID_MAP_MAX_EXTENTS]; }; +#define USERNS_SETGROUPS_ALLOWED 1UL + +#define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED + struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; @@ -27,6 +31,7 @@ struct user_namespace { kuid_t owner; kgid_t group; unsigned int proc_inum; + unsigned long flags; bool may_mount_sysfs; bool may_mount_proc; }; @@ -59,6 +64,8 @@ extern struct seq_operations proc_projid_seq_operations; extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); +extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); +extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); #else diff --git a/kernel/user.c b/kernel/user.c index 69b4c3d48cde..6bbef5604101 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,6 +51,7 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, + .flags = USERNS_INIT_FLAGS, .may_mount_sysfs = true, .may_mount_proc = true, }; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 78009bd5efe6..9da8cbbcd985 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -100,6 +100,11 @@ int create_user_ns(struct cred *new) ns->owner = owner; ns->group = group; + /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ + mutex_lock(&userns_state_mutex); + ns->flags = parent_ns->flags; + mutex_unlock(&userns_state_mutex); + set_cred_user_ns(new, ns); update_mnt_policy(ns); @@ -827,6 +832,84 @@ static bool new_idmap_permitted(const struct file *file, return false; } +int proc_setgroups_show(struct seq_file *seq, void *v) +{ + struct user_namespace *ns = seq->private; + unsigned long userns_flags = ACCESS_ONCE(ns->flags); + + seq_printf(seq, "%s\n", + (userns_flags & USERNS_SETGROUPS_ALLOWED) ? + "allow" : "deny"); + return 0; +} + +ssize_t proc_setgroups_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *seq = file->private_data; + struct user_namespace *ns = seq->private; + char kbuf[8], *pos; + bool setgroups_allowed; + ssize_t ret; + + /* Only allow a very narrow range of strings to be written */ + ret = -EINVAL; + if ((*ppos != 0) || (count >= sizeof(kbuf))) + goto out; + + /* What was written? */ + ret = -EFAULT; + if (copy_from_user(kbuf, buf, count)) + goto out; + kbuf[count] = '\0'; + pos = kbuf; + + /* What is being requested? */ + ret = -EINVAL; + if (strncmp(pos, "allow", 5) == 0) { + pos += 5; + setgroups_allowed = true; + } + else if (strncmp(pos, "deny", 4) == 0) { + pos += 4; + setgroups_allowed = false; + } + else + goto out; + + /* Verify there is not trailing junk on the line */ + pos = skip_spaces(pos); + if (*pos != '\0') + goto out; + + ret = -EPERM; + mutex_lock(&userns_state_mutex); + if (setgroups_allowed) { + /* Enabling setgroups after setgroups has been disabled + * is not allowed. + */ + if (!(ns->flags & USERNS_SETGROUPS_ALLOWED)) + goto out_unlock; + } else { + /* Permanently disabling setgroups after setgroups has + * been enabled by writing the gid_map is not allowed. + */ + if (ns->gid_map.nr_extents != 0) + goto out_unlock; + ns->flags &= ~USERNS_SETGROUPS_ALLOWED; + } + mutex_unlock(&userns_state_mutex); + + /* Report a successful write */ + *ppos = count; + ret = count; +out: + return ret; +out_unlock: + mutex_unlock(&userns_state_mutex); + goto out; +} + bool userns_may_setgroups(const struct user_namespace *ns) { bool allowed; @@ -836,6 +919,8 @@ bool userns_may_setgroups(const struct user_namespace *ns) * the user namespace has been established. */ allowed = ns->gid_map.nr_extents != 0; + /* Is setgroups allowed? */ + allowed = allowed && (ns->flags & USERNS_SETGROUPS_ALLOWED); mutex_unlock(&userns_state_mutex); return allowed; From afb5285388ce7e7623ab9ddb08f287551e70426f Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Fri, 5 Dec 2014 19:36:04 -0600 Subject: [PATCH 29/39] userns: Allow setting gid_maps without privilege when setgroups is disabled commit 66d2f338ee4c449396b6f99f5e75cd18eb6df272 upstream. Now that setgroups can be disabled and not reenabled, setting gid_map without privielge can now be enabled when setgroups is disabled. This restores most of the functionality that was lost when unprivileged setting of gid_map was removed. Applications that use this functionality will need to check to see if they use setgroups or init_groups, and if they don't they can be fixed by simply disabling setgroups before writing to gid_map. Reviewed-by: Andy Lutomirski Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- kernel/user_namespace.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9da8cbbcd985..3f2fb33d291a 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -814,6 +814,11 @@ static bool new_idmap_permitted(const struct file *file, kuid_t uid = make_kuid(ns->parent, id); if (uid_eq(uid, cred->euid)) return true; + } else if (cap_setid == CAP_SETGID) { + kgid_t gid = make_kgid(ns->parent, id); + if (!(ns->flags & USERNS_SETGROUPS_ALLOWED) && + gid_eq(gid, cred->egid)) + return true; } } From 3150a5ae1c08aac6aba8454cb536316a48eecc52 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 2 Dec 2014 13:56:30 -0600 Subject: [PATCH 30/39] userns: Unbreak the unprivileged remount tests commit db86da7cb76f797a1a8b445166a15cb922c6ff85 upstream. A security fix in caused the way the unprivileged remount tests were using user namespaces to break. Tweak the way user namespaces are being used so the test works again. Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- .../mount/unprivileged-remount-test.c | 32 ++++++++++++++----- 1 file changed, 24 insertions(+), 8 deletions(-) diff --git a/tools/testing/selftests/mount/unprivileged-remount-test.c b/tools/testing/selftests/mount/unprivileged-remount-test.c index 9669d375625a..517785052f1c 100644 --- a/tools/testing/selftests/mount/unprivileged-remount-test.c +++ b/tools/testing/selftests/mount/unprivileged-remount-test.c @@ -53,17 +53,14 @@ static void die(char *fmt, ...) exit(EXIT_FAILURE); } -static void write_file(char *filename, char *fmt, ...) +static void vmaybe_write_file(bool enoent_ok, char *filename, char *fmt, va_list ap) { char buf[4096]; int fd; ssize_t written; int buf_len; - va_list ap; - va_start(ap, fmt); buf_len = vsnprintf(buf, sizeof(buf), fmt, ap); - va_end(ap); if (buf_len < 0) { die("vsnprintf failed: %s\n", strerror(errno)); @@ -74,6 +71,8 @@ static void write_file(char *filename, char *fmt, ...) fd = open(filename, O_WRONLY); if (fd < 0) { + if ((errno == ENOENT) && enoent_ok) + return; die("open of %s failed: %s\n", filename, strerror(errno)); } @@ -92,6 +91,26 @@ static void write_file(char *filename, char *fmt, ...) } } +static void maybe_write_file(char *filename, char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vmaybe_write_file(true, filename, fmt, ap); + va_end(ap); + +} + +static void write_file(char *filename, char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vmaybe_write_file(false, filename, fmt, ap); + va_end(ap); + +} + static int read_mnt_flags(const char *path) { int ret; @@ -144,13 +163,10 @@ static void create_and_enter_userns(void) strerror(errno)); } + maybe_write_file("/proc/self/setgroups", "deny"); write_file("/proc/self/uid_map", "0 %d 1", uid); write_file("/proc/self/gid_map", "0 %d 1", gid); - if (setgroups(0, NULL) != 0) { - die("setgroups failed: %s\n", - strerror(errno)); - } if (setgid(0) != 0) { die ("setgid(0) failed %s\n", strerror(errno)); From 3d53f1f7166c4bc5e936155de143676bf5fc7b0c Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Fri, 19 Dec 2014 13:36:08 +0100 Subject: [PATCH 31/39] crypto: af_alg - fix backlog handling commit 7e77bdebff5cb1e9876c561f69710b9ab8fa1f7e upstream. If a request is backlogged, it's complete() handler will get called twice: once with -EINPROGRESS, and once with the final error code. af_alg's complete handler, unlike other users, does not handle the -EINPROGRESS but instead always completes the completion that recvmsg() is waiting on. This can lead to a return to user space while the request is still pending in the driver. If userspace closes the sockets before the requests are handled by the driver, this will lead to use-after-frees (and potential crashes) in the kernel due to the tfm having been freed. The crashes can be easily reproduced (for example) by reducing the max queue length in cryptod.c and running the following (from http://www.chronox.de/libkcapi.html) on AES-NI capable hardware: $ while true; do kcapi -x 1 -e -c '__ecb-aes-aesni' \ -k 00000000000000000000000000000000 \ -p 00000000000000000000000000000000 >/dev/null & done Signed-off-by: Rabin Vincent Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- crypto/af_alg.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crypto/af_alg.c b/crypto/af_alg.c index bf948e134981..6ef6e2ad344e 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -449,6 +449,9 @@ void af_alg_complete(struct crypto_async_request *req, int err) { struct af_alg_completion *completion = req->data; + if (err == -EINPROGRESS) + return; + completion->err = err; complete(&completion->completion); } From cb22aba0ee2bc330033b85447b4d44b1501ee75f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 10 Dec 2014 15:52:22 -0800 Subject: [PATCH 32/39] ncpfs: return proper error from NCP_IOC_SETROOT ioctl commit a682e9c28cac152e6e54c39efcf046e0c8cfcf63 upstream. If some error happens in NCP_IOC_SETROOT ioctl, the appropriate error return value is then (in most cases) just overwritten before we return. This can result in reporting success to userspace although error happened. This bug was introduced by commit 2e54eb96e2c8 ("BKL: Remove BKL from ncpfs"). Propagate the errors correctly. Coverity id: 1226925. Fixes: 2e54eb96e2c80 ("BKL: Remove BKL from ncpfs") Signed-off-by: Jan Kara Cc: Petr Vandrovec Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/ncpfs/ioctl.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 60426ccb3b65..2f970de02b16 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -448,7 +448,6 @@ static long __ncp_ioctl(struct inode *inode, unsigned int cmd, unsigned long arg result = -EIO; } } - result = 0; } mutex_unlock(&server->root_setup_lock); From 12279351a42d51638a21ede8a3f6c62aea26491e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 10 Dec 2014 15:55:25 -0800 Subject: [PATCH 33/39] exit: pidns: alloc_pid() leaks pid_namespace if child_reaper is exiting commit 24c037ebf5723d4d9ab0996433cee4f96c292a4d upstream. alloc_pid() does get_pid_ns() beforehand but forgets to put_pid_ns() if it fails because disable_pid_allocation() was called by the exiting child_reaper. We could simply move get_pid_ns() down to successful return, but this fix tries to be as trivial as possible. Signed-off-by: Oleg Nesterov Reviewed-by: "Eric W. Biederman" Cc: Aaron Tomlin Cc: Pavel Emelyanov Cc: Serge Hallyn Cc: Sterling Alexander Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/pid.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/pid.c b/kernel/pid.c index 0eb6d8e8b1da..3cdba5173600 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -335,6 +335,8 @@ out: out_unlock: spin_unlock_irq(&pidmap_lock); + put_pid_ns(ns); + out_free: while (++i <= ns->level) free_pidmap(pid->numbers + i); From 5fdeca1b0acd9790cdd5891a76bb6dfb707d2a1c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 19 Dec 2014 12:21:47 +0100 Subject: [PATCH 34/39] udf: Verify symlink size before loading it commit a1d47b262952a45aae62bd49cfaf33dd76c11a2c upstream. UDF specification allows arbitrarily large symlinks. However we support only symlinks at most one block large. Check the length of the symlink so that we don't access memory beyond end of the symlink block. Reported-by: Carl Henrik Lunde Signed-off-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/udf/symlink.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index d7c6dbe4194b..d89f324bc387 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -80,11 +80,17 @@ static int udf_symlink_filler(struct file *file, struct page *page) struct inode *inode = page->mapping->host; struct buffer_head *bh = NULL; unsigned char *symlink; - int err = -EIO; + int err; unsigned char *p = kmap(page); struct udf_inode_info *iinfo; uint32_t pos; + /* We don't support symlinks longer than one block */ + if (inode->i_size > inode->i_sb->s_blocksize) { + err = -ENAMETOOLONG; + goto out_unmap; + } + iinfo = UDF_I(inode); pos = udf_block_map(inode, 0); @@ -94,8 +100,10 @@ static int udf_symlink_filler(struct file *file, struct page *page) } else { bh = sb_bread(inode->i_sb, pos); - if (!bh) - goto out; + if (!bh) { + err = -EIO; + goto out_unlock_inode; + } symlink = bh->b_data; } @@ -109,9 +117,10 @@ static int udf_symlink_filler(struct file *file, struct page *page) unlock_page(page); return 0; -out: +out_unlock_inode: up_read(&iinfo->i_data_sem); SetPageError(page); +out_unmap: kunmap(page); unlock_page(page); return err; From ed775f3161684770d506e150073d9f271335d5d3 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Tue, 7 Oct 2014 15:51:55 -0500 Subject: [PATCH 35/39] eCryptfs: Force RO mount when encrypted view is enabled commit 332b122d39c9cbff8b799007a825d94b2e7c12f2 upstream. The ecryptfs_encrypted_view mount option greatly changes the functionality of an eCryptfs mount. Instead of encrypting and decrypting lower files, it provides a unified view of the encrypted files in the lower filesystem. The presence of the ecryptfs_encrypted_view mount option is intended to force a read-only mount and modifying files is not supported when the feature is in use. See the following commit for more information: e77a56d [PATCH] eCryptfs: Encrypted passthrough This patch forces the mount to be read-only when the ecryptfs_encrypted_view mount option is specified by setting the MS_RDONLY flag on the superblock. Additionally, this patch removes some broken logic in ecryptfs_open() that attempted to prevent modifications of files when the encrypted view feature was in use. The check in ecryptfs_open() was not sufficient to prevent file modifications using system calls that do not operate on a file descriptor. Signed-off-by: Tyler Hicks Reported-by: Priya Bansal Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/file.c | 12 ------------ fs/ecryptfs/main.c | 16 +++++++++++++--- 2 files changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index a7abbea2c096..9ff3664bb3ea 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -196,23 +196,11 @@ static int ecryptfs_open(struct inode *inode, struct file *file) { int rc = 0; struct ecryptfs_crypt_stat *crypt_stat = NULL; - struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct dentry *ecryptfs_dentry = file->f_path.dentry; /* Private value of ecryptfs_dentry allocated in * ecryptfs_lookup() */ struct ecryptfs_file_info *file_info; - mount_crypt_stat = &ecryptfs_superblock_to_private( - ecryptfs_dentry->d_sb)->mount_crypt_stat; - if ((mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) - && ((file->f_flags & O_WRONLY) || (file->f_flags & O_RDWR) - || (file->f_flags & O_CREAT) || (file->f_flags & O_TRUNC) - || (file->f_flags & O_APPEND))) { - printk(KERN_WARNING "Mount has encrypted view enabled; " - "files may only be read\n"); - rc = -EPERM; - goto out; - } /* Released in ecryptfs_release or end of function if failure */ file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); ecryptfs_set_file_private(file, file_info); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index e924cf45aad9..329a9cc2b2eb 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -494,6 +494,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags { struct super_block *s; struct ecryptfs_sb_info *sbi; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct ecryptfs_dentry_info *root_info; const char *err = "Getting sb failed"; struct inode *inode; @@ -512,6 +513,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags err = "Error parsing options"; goto out; } + mount_crypt_stat = &sbi->mount_crypt_stat; s = sget(fs_type, NULL, set_anon_super, flags, NULL); if (IS_ERR(s)) { @@ -558,11 +560,19 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags /** * Set the POSIX ACL flag based on whether they're enabled in the lower - * mount. Force a read-only eCryptfs mount if the lower mount is ro. - * Allow a ro eCryptfs mount even when the lower mount is rw. + * mount. */ s->s_flags = flags & ~MS_POSIXACL; - s->s_flags |= path.dentry->d_sb->s_flags & (MS_RDONLY | MS_POSIXACL); + s->s_flags |= path.dentry->d_sb->s_flags & MS_POSIXACL; + + /** + * Force a read-only eCryptfs mount when: + * 1) The lower mount is ro + * 2) The ecryptfs_encrypted_view mount option is specified + */ + if (path.dentry->d_sb->s_flags & MS_RDONLY || + mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + s->s_flags |= MS_RDONLY; s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; From 66012982c8e3344b6fc94defba2909356c607a6d Mon Sep 17 00:00:00 2001 From: Michael Halcrow Date: Wed, 26 Nov 2014 09:09:16 -0800 Subject: [PATCH 36/39] eCryptfs: Remove buggy and unnecessary write in file name decode routine commit 942080643bce061c3dd9d5718d3b745dcb39a8bc upstream. Dmitry Chernenkov used KASAN to discover that eCryptfs writes past the end of the allocated buffer during encrypted filename decoding. This fix corrects the issue by getting rid of the unnecessary 0 write when the current bit offset is 2. Signed-off-by: Michael Halcrow Reported-by: Dmitry Chernenkov Suggested-by: Kees Cook Signed-off-by: Tyler Hicks Signed-off-by: Greg Kroah-Hartman --- fs/ecryptfs/crypto.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index f71ec125290d..1da2446bf6b0 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -2102,7 +2102,6 @@ ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size, break; case 2: dst[dst_byte_offset++] |= (src_byte); - dst[dst_byte_offset] = 0; current_bit_offset = 0; break; } From d8bf9ec799b05f5a083e786894e276cdefea470a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 14 Nov 2014 16:16:30 -0500 Subject: [PATCH 37/39] Btrfs: do not move em to modified list when unpinning commit a28046956c71985046474283fa3bcd256915fb72 upstream. We use the modified list to keep track of which extents have been modified so we know which ones are candidates for logging at fsync() time. Newly modified extents are added to the list at modification time, around the same time the ordered extent is created. We do this so that we don't have to wait for ordered extents to complete before we know what we need to log. The problem is when something like this happens log extent 0-4k on inode 1 copy csum for 0-4k from ordered extent into log sync log commit transaction log some other extent on inode 1 ordered extent for 0-4k completes and adds itself onto modified list again log changed extents see ordered extent for 0-4k has already been logged at this point we assume the csum has been copied sync log crash On replay we will see the extent 0-4k in the log, drop the original 0-4k extent which is the same one that we are replaying which also drops the csum, and then we won't find the csum in the log for that bytenr. This of course causes us to have errors about not having csums for certain ranges of our inode. So remove the modified list manipulation in unpin_extent_cache, any modified extents should have been added well before now, and we don't want them re-logged. This fixes my test that I could reliably reproduce this problem with. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent_map.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index a4a7a1a8da95..0a3809500599 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -263,8 +263,6 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, if (!em) goto out; - if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) - list_move(&em->list, &tree->modified_extents); em->generation = gen; clear_bit(EXTENT_FLAG_PINNED, &em->flags); em->mod_start = em->start; From 5f55aee3b8880bca80c64872eadbaea1632bd4ac Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sun, 7 Dec 2014 21:31:47 +0000 Subject: [PATCH 38/39] Btrfs: fix fs corruption on transaction abort if device supports discard commit 678886bdc6378c1cbd5072da2c5a3035000214e3 upstream. When we abort a transaction we iterate over all the ranges marked as dirty in fs_info->freed_extents[0] and fs_info->freed_extents[1], clear them from those trees, add them back (unpin) to the free space caches and, if the fs was mounted with "-o discard", perform a discard on those regions. Also, after adding the regions to the free space caches, a fitrim ioctl call can see those ranges in a block group's free space cache and perform a discard on the ranges, so the same issue can happen without "-o discard" as well. This causes corruption, affecting one or multiple btree nodes (in the worst case leaving the fs unmountable) because some of those ranges (the ones in the fs_info->pinned_extents tree) correspond to btree nodes/leafs that are referred by the last committed super block - breaking the rule that anything that was committed by a transaction is untouched until the next transaction commits successfully. I ran into this while running in a loop (for several hours) the fstest that I recently submitted: [PATCH] fstests: add btrfs test to stress chunk allocation/removal and fstrim The corruption always happened when a transaction aborted and then fsck complained like this: _check_btrfs_filesystem: filesystem on /dev/sdc is inconsistent *** fsck.btrfs output *** Check tree block failed, want=94945280, have=0 Check tree block failed, want=94945280, have=0 Check tree block failed, want=94945280, have=0 Check tree block failed, want=94945280, have=0 Check tree block failed, want=94945280, have=0 read block failed check_tree_block Couldn't open file system In this case 94945280 corresponded to the root of a tree. Using frace what I observed was the following sequence of steps happened: 1) transaction N started, fs_info->pinned_extents pointed to fs_info->freed_extents[0]; 2) node/eb 94945280 is created; 3) eb is persisted to disk; 4) transaction N commit starts, fs_info->pinned_extents now points to fs_info->freed_extents[1], and transaction N completes; 5) transaction N + 1 starts; 6) eb is COWed, and btrfs_free_tree_block() called for this eb; 7) eb range (94945280 to 94945280 + 16Kb) is added to fs_info->pinned_extents (fs_info->freed_extents[1]); 8) Something goes wrong in transaction N + 1, like hitting ENOSPC for example, and the transaction is aborted, turning the fs into readonly mode. The stack trace I got for example: [112065.253935] [] dump_stack+0x4d/0x66 [112065.254271] [] warn_slowpath_common+0x7f/0x98 [112065.254567] [] ? __btrfs_abort_transaction+0x50/0x10b [btrfs] [112065.261674] [] warn_slowpath_fmt+0x48/0x50 [112065.261922] [] ? btrfs_free_path+0x26/0x29 [btrfs] [112065.262211] [] __btrfs_abort_transaction+0x50/0x10b [btrfs] [112065.262545] [] btrfs_remove_chunk+0x537/0x58b [btrfs] [112065.262771] [] btrfs_delete_unused_bgs+0x1de/0x21b [btrfs] [112065.263105] [] cleaner_kthread+0x100/0x12f [btrfs] (...) [112065.264493] ---[ end trace dd7903a975a31a08 ]--- [112065.264673] BTRFS: error (device sdc) in btrfs_remove_chunk:2625: errno=-28 No space left [112065.264997] BTRFS info (device sdc): forced readonly 9) The clear kthread sees that the BTRFS_FS_STATE_ERROR bit is set in fs_info->fs_state and calls btrfs_cleanup_transaction(), which in turn calls btrfs_destroy_pinned_extent(); 10) Then btrfs_destroy_pinned_extent() iterates over all the ranges marked as dirty in fs_info->freed_extents[], and for each one it calls discard, if the fs was mounted with "-o discard", and adds the range to the free space cache of the respective block group; 11) btrfs_trim_block_group(), invoked from the fitrim ioctl code path, sees the free space entries and performs a discard; 12) After an umount and mount (or fsck), our eb's location on disk was full of zeroes, and it should have been untouched, because it was marked as dirty in the fs_info->pinned_extents tree, and therefore used by the trees that the last committed superblock points to. Fix this by not performing a discard and not adding the ranges to the free space caches - it's useless from this point since the fs is now in readonly mode and we won't write free space caches to disk anymore (otherwise we would leak space) nor any new superblock. By not adding the ranges to the free space caches, it prevents other code paths from allocating that space and write to it as well, therefore being safer and simpler. This isn't a new problem, as it's been present since 2011 (git commit acce952b0263825da32cf10489413dec78053347). Signed-off-by: Filipe Manana Signed-off-by: Chris Mason Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/disk-io.c | 6 ------ fs/btrfs/extent-tree.c | 10 ++++++---- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index abecce399354..7360f03ddbe1 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3857,12 +3857,6 @@ again: if (ret) break; - /* opt_discard */ - if (btrfs_test_opt(root, DISCARD)) - ret = btrfs_error_discard_extent(root, start, - end + 1 - start, - NULL); - clear_extent_dirty(unpin, start, end, GFP_NOFS); btrfs_error_unpin_extent_range(root, start, end); cond_resched(); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index bbafa05519da..f99c71e40f8b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5277,7 +5277,8 @@ void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, update_global_block_rsv(fs_info); } -static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) +static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end, + const bool return_free_space) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_block_group_cache *cache = NULL; @@ -5301,7 +5302,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) if (start < cache->last_byte_to_unpin) { len = min(len, cache->last_byte_to_unpin - start); - btrfs_add_free_space(cache, start, len); + if (return_free_space) + btrfs_add_free_space(cache, start, len); } start += len; @@ -5364,7 +5366,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, end + 1 - start, NULL); clear_extent_dirty(unpin, start, end, GFP_NOFS); - unpin_extent_range(root, start, end); + unpin_extent_range(root, start, end, true); cond_resched(); } @@ -8564,7 +8566,7 @@ out: int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end) { - return unpin_extent_range(root, start, end); + return unpin_extent_range(root, start, end, false); } int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr, From 46a490cc58e717f8058b878635c11535583840d2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 8 Jan 2015 09:58:30 -0800 Subject: [PATCH 39/39] Linux 3.10.64 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9383fe24baa9..e5b63fb3d0e1 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 3 PATCHLEVEL = 10 -SUBLEVEL = 63 +SUBLEVEL = 64 EXTRAVERSION = NAME = TOSSUG Baby Fish