From ef5d437f71afdf4afdbab99213add99f4b1318fd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 25 Oct 2012 13:37:31 -0700 Subject: [PATCH 01/18] mm: fix XFS oops due to dirty pages without buffers on s390 On s390 any write to a page (even from kernel itself) sets architecture specific page dirty bit. Thus when a page is written to via buffered write, HW dirty bit gets set and when we later map and unmap the page, page_remove_rmap() finds the dirty bit and calls set_page_dirty(). Dirtying of a page which shouldn't be dirty can cause all sorts of problems to filesystems. The bug we observed in practice is that buffers from the page get freed, so when the page gets later marked as dirty and writeback writes it, XFS crashes due to an assertion BUG_ON(!PagePrivate(page)) in page_buffers() called from xfs_count_page_state(). Similar problem can also happen when zero_user_segment() call from xfs_vm_writepage() (or block_write_full_page() for that matter) set the hardware dirty bit during writeback, later buffers get freed, and then page unmapped. Fix the issue by ignoring s390 HW dirty bit for page cache pages of mappings with mapping_cap_account_dirty(). This is safe because for such mappings when a page gets marked as writeable in PTE it is also marked dirty in do_wp_page() or do_page_fault(). When the dirty bit is cleared by clear_page_dirty_for_io(), the page gets writeprotected in page_mkclean(). So pagecache page is writeable if and only if it is dirty. Thanks to Hugh Dickins for pointing out mapping has to have mapping_cap_account_dirty() for things to work and proposing a cleaned up variant of the patch. The patch has survived about two hours of running fsx-linux on tmpfs while heavily swapping and several days of running on out build machines where the original problem was triggered. Signed-off-by: Jan Kara Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Hugh Dickins Cc: Heiko Carstens Cc: [3.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 7df7984d476c..2ee1ef0f317b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -56,6 +56,7 @@ #include #include #include +#include #include @@ -926,11 +927,8 @@ int page_mkclean(struct page *page) if (page_mapped(page)) { struct address_space *mapping = page_mapping(page); - if (mapping) { + if (mapping) ret = page_mkclean_file(mapping, page); - if (page_test_and_clear_dirty(page_to_pfn(page), 1)) - ret = 1; - } } return ret; @@ -1116,6 +1114,7 @@ void page_add_file_rmap(struct page *page) */ void page_remove_rmap(struct page *page) { + struct address_space *mapping = page_mapping(page); bool anon = PageAnon(page); bool locked; unsigned long flags; @@ -1138,8 +1137,19 @@ void page_remove_rmap(struct page *page) * this if the page is anon, so about to be freed; but perhaps * not if it's in swapcache - there might be another pte slot * containing the swap entry, but page not yet written to swap. + * + * And we can skip it on file pages, so long as the filesystem + * participates in dirty tracking; but need to catch shm and tmpfs + * and ramfs pages which have been modified since creation by read + * fault. + * + * Note that mapping must be decided above, before decrementing + * mapcount (which luckily provides a barrier): once page is unmapped, + * it could be truncated and page->mapping reset to NULL at any moment. + * Note also that we are relying on page_mapping(page) to set mapping + * to &swapper_space when PageSwapCache(page). */ - if ((!anon || PageSwapCache(page)) && + if (mapping && !mapping_cap_account_dirty(mapping) && page_test_and_clear_dirty(page_to_pfn(page), 1)) set_page_dirty(page); /* From 8c9506d16925f1b1314d93af383ca3134eb534d8 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Thu, 25 Oct 2012 13:37:34 -0700 Subject: [PATCH 02/18] cgroup: fix invalid rcu dereference Commit ad676077a2ae ("device_cgroup: convert device_cgroup internally to policy + exceptions") removed rcu locks which are needed in task_devcgroup called in this chain: devcgroup_inode_mknod OR __devcgroup_inode_permission -> __devcgroup_inode_permission -> task_devcgroup -> task_subsys_state -> task_subsys_state_check. Change the code so that task_devcgroup is safely called with rcu read lock held. =============================== [ INFO: suspicious RCU usage. ] 3.6.0-rc5-next-20120913+ #42 Not tainted ------------------------------- include/linux/cgroup.h:553 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 1, debug_locks = 0 2 locks held by kdevtmpfs/23: #0: (sb_writers){.+.+.+}, at: [] mnt_want_write+0x1f/0x50 #1: (&sb->s_type->i_mutex_key#3/1){+.+.+.}, at: [] kern_path_create+0x7f/0x170 stack backtrace: Pid: 23, comm: kdevtmpfs Not tainted 3.6.0-rc5-next-20120913+ #42 Call Trace: lockdep_rcu_suspicious+0xfd/0x130 devcgroup_inode_mknod+0x19d/0x240 vfs_mknod+0x71/0xf0 handle_create.isra.2+0x72/0x200 devtmpfsd+0x114/0x140 ? handle_create.isra.2+0x200/0x200 kthread+0xd6/0xe0 kernel_thread_helper+0x4/0x10 Signed-off-by: Jiri Slaby Cc: Dave Jones Cc: Tejun Heo Cc: Li Zefan Cc: James Morris Cc: Pavel Emelyanov Acked-by: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- security/device_cgroup.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 44dfc415a379..46d01fcc0d15 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -533,10 +533,10 @@ struct cgroup_subsys devices_subsys = { * * returns 0 on success, -EPERM case the operation is not permitted */ -static int __devcgroup_check_permission(struct dev_cgroup *dev_cgroup, - short type, u32 major, u32 minor, +static int __devcgroup_check_permission(short type, u32 major, u32 minor, short access) { + struct dev_cgroup *dev_cgroup; struct dev_exception_item ex; int rc; @@ -547,6 +547,7 @@ static int __devcgroup_check_permission(struct dev_cgroup *dev_cgroup, ex.access = access; rcu_read_lock(); + dev_cgroup = task_devcgroup(current); rc = may_access(dev_cgroup, &ex); rcu_read_unlock(); @@ -558,7 +559,6 @@ static int __devcgroup_check_permission(struct dev_cgroup *dev_cgroup, int __devcgroup_inode_permission(struct inode *inode, int mask) { - struct dev_cgroup *dev_cgroup = task_devcgroup(current); short type, access = 0; if (S_ISBLK(inode->i_mode)) @@ -570,13 +570,12 @@ int __devcgroup_inode_permission(struct inode *inode, int mask) if (mask & MAY_READ) access |= ACC_READ; - return __devcgroup_check_permission(dev_cgroup, type, imajor(inode), - iminor(inode), access); + return __devcgroup_check_permission(type, imajor(inode), iminor(inode), + access); } int devcgroup_inode_mknod(int mode, dev_t dev) { - struct dev_cgroup *dev_cgroup = task_devcgroup(current); short type; if (!S_ISBLK(mode) && !S_ISCHR(mode)) @@ -587,7 +586,7 @@ int devcgroup_inode_mknod(int mode, dev_t dev) else type = DEV_CHAR; - return __devcgroup_check_permission(dev_cgroup, type, MAJOR(dev), - MINOR(dev), ACC_MKNOD); + return __devcgroup_check_permission(type, MAJOR(dev), MINOR(dev), + ACC_MKNOD); } From 5b7aa7d5bb2c5cf7fc05aaa41561af321706ab5f Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Thu, 25 Oct 2012 13:37:38 -0700 Subject: [PATCH 03/18] device_cgroup: rename deny_all to behavior This was done in a v2 patch but v1 ended up being committed. The variable name is less confusing and stores the default behavior when no matching exception exists. Signed-off-by: Aristeu Rozanski Cc: Dave Jones Cc: Tejun Heo Cc: Li Zefan Cc: James Morris Cc: Pavel Emelyanov Acked-by: Serge Hallyn Cc: Jiri Slaby Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- security/device_cgroup.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 46d01fcc0d15..76503df23770 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -42,7 +42,10 @@ struct dev_exception_item { struct dev_cgroup { struct cgroup_subsys_state css; struct list_head exceptions; - bool deny_all; + enum { + DEVCG_DEFAULT_ALLOW, + DEVCG_DEFAULT_DENY, + } behavior; }; static inline struct dev_cgroup *css_to_devcgroup(struct cgroup_subsys_state *s) @@ -182,13 +185,13 @@ static struct cgroup_subsys_state *devcgroup_create(struct cgroup *cgroup) parent_cgroup = cgroup->parent; if (parent_cgroup == NULL) - dev_cgroup->deny_all = false; + dev_cgroup->behavior = DEVCG_DEFAULT_ALLOW; else { parent_dev_cgroup = cgroup_to_devcgroup(parent_cgroup); mutex_lock(&devcgroup_mutex); ret = dev_exceptions_copy(&dev_cgroup->exceptions, &parent_dev_cgroup->exceptions); - dev_cgroup->deny_all = parent_dev_cgroup->deny_all; + dev_cgroup->behavior = parent_dev_cgroup->behavior; mutex_unlock(&devcgroup_mutex); if (ret) { kfree(dev_cgroup); @@ -260,7 +263,7 @@ static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft, * - List the exceptions in case the default policy is to deny * This way, the file remains as a "whitelist of devices" */ - if (devcgroup->deny_all == false) { + if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { set_access(acc, ACC_MASK); set_majmin(maj, ~0); set_majmin(min, ~0); @@ -314,12 +317,12 @@ static int may_access(struct dev_cgroup *dev_cgroup, * In two cases we'll consider this new exception valid: * - the dev cgroup has its default policy to allow + exception list: * the new exception should *not* match any of the exceptions - * (!deny_all, !match) + * (behavior == DEVCG_DEFAULT_ALLOW, !match) * - the dev cgroup has its default policy to deny + exception list: * the new exception *should* match the exceptions - * (deny_all, match) + * (behavior == DEVCG_DEFAULT_DENY, match) */ - if (dev_cgroup->deny_all == match) + if ((dev_cgroup->behavior == DEVCG_DEFAULT_DENY) == match) return 1; return 0; } @@ -375,11 +378,11 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, if (!parent_has_perm(devcgroup, &ex)) return -EPERM; dev_exception_clean(devcgroup); - devcgroup->deny_all = false; + devcgroup->behavior = DEVCG_DEFAULT_ALLOW; break; case DEVCG_DENY: dev_exception_clean(devcgroup); - devcgroup->deny_all = true; + devcgroup->behavior = DEVCG_DEFAULT_DENY; break; default: return -EINVAL; @@ -452,7 +455,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, * an matching exception instead. And be silent about it: we * don't want to break compatibility */ - if (devcgroup->deny_all == false) { + if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) { dev_exception_rm(devcgroup, &ex); return 0; } @@ -463,7 +466,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, * an matching exception instead. And be silent about it: we * don't want to break compatibility */ - if (devcgroup->deny_all == true) { + if (devcgroup->behavior == DEVCG_DEFAULT_DENY) { dev_exception_rm(devcgroup, &ex); return 0; } From 26fd8405dd470cb8b54cb96859b7dd437e5e1391 Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Thu, 25 Oct 2012 13:37:41 -0700 Subject: [PATCH 04/18] device_cgroup: stop using simple_strtoul() Convert the code to use kstrtou32() instead of simple_strtoul() which is deprecated. The real size of the variables are u32, so use kstrtou32 instead of kstrtoul Signed-off-by: Aristeu Rozanski Cc: Dave Jones Cc: Tejun Heo Cc: Li Zefan Cc: James Morris Cc: Pavel Emelyanov Acked-by: Serge Hallyn Cc: Jiri Slaby Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- security/device_cgroup.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 76503df23770..4fbae8d0b36c 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -361,8 +361,8 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, int filetype, const char *buffer) { const char *b; - char *endp; - int count; + char temp[12]; /* 11 + 1 characters needed for a u32 */ + int count, rc; struct dev_exception_item ex; if (!capable(CAP_SYS_ADMIN)) @@ -405,8 +405,16 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, ex.major = ~0; b++; } else if (isdigit(*b)) { - ex.major = simple_strtoul(b, &endp, 10); - b = endp; + memset(temp, 0, sizeof(temp)); + for (count = 0; count < sizeof(temp) - 1; count++) { + temp[count] = *b; + b++; + if (!isdigit(*b)) + break; + } + rc = kstrtou32(temp, 10, &ex.major); + if (rc) + return -EINVAL; } else { return -EINVAL; } @@ -419,8 +427,16 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, ex.minor = ~0; b++; } else if (isdigit(*b)) { - ex.minor = simple_strtoul(b, &endp, 10); - b = endp; + memset(temp, 0, sizeof(temp)); + for (count = 0; count < sizeof(temp) - 1; count++) { + temp[count] = *b; + b++; + if (!isdigit(*b)) + break; + } + rc = kstrtou32(temp, 10, &ex.minor); + if (rc) + return -EINVAL; } else { return -EINVAL; } From 4cef7299b4786879a3e113e84084a72b24590c5b Mon Sep 17 00:00:00 2001 From: Aristeu Rozanski Date: Thu, 25 Oct 2012 13:37:45 -0700 Subject: [PATCH 05/18] device_cgroup: add proper checking when changing default behavior Before changing a group's default behavior to ALLOW, we must check if its parent's behavior is also ALLOW. Signed-off-by: Aristeu Rozanski Cc: Tejun Heo Cc: Li Zefan Cc: James Morris Cc: Pavel Emelyanov Acked-by: Serge Hallyn Cc: Jiri Slaby Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- security/device_cgroup.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 4fbae8d0b36c..842c254396db 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -344,6 +344,17 @@ static int parent_has_perm(struct dev_cgroup *childcg, return may_access(parent, ex); } +/** + * may_allow_all - checks if it's possible to change the behavior to + * allow based on parent's rules. + * @parent: device cgroup's parent + * returns: != 0 in case it's allowed, 0 otherwise + */ +static inline int may_allow_all(struct dev_cgroup *parent) +{ + return parent->behavior == DEVCG_DEFAULT_ALLOW; +} + /* * Modify the exception list using allow/deny rules. * CAP_SYS_ADMIN is needed for this. It's at least separate from CAP_MKNOD @@ -364,6 +375,8 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, char temp[12]; /* 11 + 1 characters needed for a u32 */ int count, rc; struct dev_exception_item ex; + struct cgroup *p = devcgroup->css.cgroup; + struct dev_cgroup *parent = cgroup_to_devcgroup(p->parent); if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -375,9 +388,13 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup, case 'a': switch (filetype) { case DEVCG_ALLOW: - if (!parent_has_perm(devcgroup, &ex)) + if (!may_allow_all(parent)) return -EPERM; dev_exception_clean(devcgroup); + rc = dev_exceptions_copy(&devcgroup->exceptions, + &parent->exceptions); + if (rc) + return rc; devcgroup->behavior = DEVCG_DEFAULT_ALLOW; break; case DEVCG_DENY: From fd6de5300e6fcad5621ef7e78eb227de9d4c8b79 Mon Sep 17 00:00:00 2001 From: Jingoo Han Date: Thu, 25 Oct 2012 13:37:48 -0700 Subject: [PATCH 06/18] backlight: ili9320: add missing SPI dependency Add this missing SPI dependency and prevent the driver from building without SPI, because functions of the spi driver are used in this driver. drivers/video/backlight/ili9320.c:51: undefined reference to `spi_sync' Also, a prompt string for CONFIG_LCD_ILI9320 is added for explicit selection. Signed-off-by: Jingoo Han Cc: Richard Purdie Cc: Ben Dooks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/video/backlight/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig index c101697a4ba7..765a945f8ea1 100644 --- a/drivers/video/backlight/Kconfig +++ b/drivers/video/backlight/Kconfig @@ -60,7 +60,8 @@ config LCD_LTV350QV The LTV350QV panel is present on all ATSTK1000 boards. config LCD_ILI9320 - tristate + tristate "ILI Technology ILI9320 controller support" + depends on SPI help If you have a panel based on the ILI9320 controller chip then say y to include a power driver for it. From eedce141cd2dad8d0cefc5468ef41898949a7031 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 25 Oct 2012 13:37:51 -0700 Subject: [PATCH 07/18] genalloc: stop crashing the system when destroying a pool The genalloc code uses the bitmap API from include/linux/bitmap.h and lib/bitmap.c, which is based on long values. Both bitmap_set from lib/bitmap.c and bitmap_set_ll, which is the lockless version from genalloc.c, use BITMAP_LAST_WORD_MASK to set the first bits in a long in the bitmap. That one uses (1 << bits) - 1, 0b111, if you are setting the first three bits. This means that the API counts from the least significant bits (LSB from now on) to the MSB. The LSB in the first long is bit 0, then. The same works for the lookup functions. The genalloc code uses longs for the bitmap, as it should. In include/linux/genalloc.h, struct gen_pool_chunk has unsigned long bits[0] as its last member. When allocating the struct, genalloc should reserve enough space for the bitmap. This should be a proper number of longs that can fit the amount of bits in the bitmap. However, genalloc allocates an integer number of bytes that fit the amount of bits, but may not be an integer amount of longs. 9 bytes, for example, could be allocated for 70 bits. This is a problem in itself if the Least Significat Bit in a long is in the byte with the largest address, which happens in Big Endian machines. This means genalloc is not allocating the byte in which it will try to set or check for a bit. This may end up in memory corruption, where genalloc will try to set the bits it has not allocated. In fact, genalloc may not set these bits because it may find them already set, because they were not zeroed since they were not allocated. And that's what causes a BUG when gen_pool_destroy is called and check for any set bits. What really happens is that genalloc uses kmalloc_node with __GFP_ZERO on gen_pool_add_virt. With SLAB and SLUB, this means the whole slab will be cleared, not only the requested bytes. Since struct gen_pool_chunk has a size that is a multiple of 8, and slab sizes are multiples of 8, we get lucky and allocate and clear the right amount of bytes. Hower, this is not the case with SLOB or with older code that did memset after allocating instead of using __GFP_ZERO. So, a simple module as this (running 3.6.0), will cause a crash when rmmod'ed. [root@phantom-lp2 foo]# cat foo.c #include #include #include #include MODULE_LICENSE("GPL"); MODULE_VERSION("0.1"); static struct gen_pool *foo_pool; static __init int foo_init(void) { int ret; foo_pool = gen_pool_create(10, -1); if (!foo_pool) return -ENOMEM; ret = gen_pool_add(foo_pool, 0xa0000000, 32 << 10, -1); if (ret) { gen_pool_destroy(foo_pool); return ret; } return 0; } static __exit void foo_exit(void) { gen_pool_destroy(foo_pool); } module_init(foo_init); module_exit(foo_exit); [root@phantom-lp2 foo]# zcat /proc/config.gz | grep SLOB CONFIG_SLOB=y [root@phantom-lp2 foo]# insmod ./foo.ko [root@phantom-lp2 foo]# rmmod foo ------------[ cut here ]------------ kernel BUG at lib/genalloc.c:243! cpu 0x4: Vector: 700 (Program Check) at [c0000000bb0e7960] pc: c0000000003cb50c: .gen_pool_destroy+0xac/0x110 lr: c0000000003cb4fc: .gen_pool_destroy+0x9c/0x110 sp: c0000000bb0e7be0 msr: 8000000000029032 current = 0xc0000000bb0e0000 paca = 0xc000000006d30e00 softe: 0 irq_happened: 0x01 pid = 13044, comm = rmmod kernel BUG at lib/genalloc.c:243! [c0000000bb0e7ca0] d000000004b00020 .foo_exit+0x20/0x38 [foo] [c0000000bb0e7d20] c0000000000dff98 .SyS_delete_module+0x1a8/0x290 [c0000000bb0e7e30] c0000000000097d4 syscall_exit+0x0/0x94 --- Exception: c00 (System Call) at 000000800753d1a0 SP (fffd0b0e640) is in userspace Signed-off-by: Thadeu Lima de Souza Cascardo Cc: Paul Gortmaker Cc: Benjamin Gaignard Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/genalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/genalloc.c b/lib/genalloc.c index ca208a92628c..54920433705a 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -178,7 +178,7 @@ int gen_pool_add_virt(struct gen_pool *pool, unsigned long virt, phys_addr_t phy struct gen_pool_chunk *chunk; int nbits = size >> pool->min_alloc_order; int nbytes = sizeof(struct gen_pool_chunk) + - (nbits + BITS_PER_BYTE - 1) / BITS_PER_BYTE; + BITS_TO_LONGS(nbits) * sizeof(long); chunk = kmalloc_node(nbytes, GFP_KERNEL | __GFP_ZERO, nid); if (unlikely(chunk == NULL)) From 29fc7c5a4f516d388fb6e1f6d24bfb04b8093e54 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 25 Oct 2012 13:37:53 -0700 Subject: [PATCH 08/18] rbtree: include linux/compiler.h for definition of __always_inline rb_erase_augmented() is a static function annotated with __always_inline. This causes a compile failure when attempting to use the rbtree implementation as a library (e.g. kvm tool): rbtree_augmented.h:125:24: error: expected `=', `,', `;', `asm' or `__attribute__' before `void' Include linux/compiler.h in rbtree_augmented.h so that the __always_inline macro is resolved correctly. Signed-off-by: Will Deacon Cc: Pekka Enberg Reviewed-by: Michel Lespinasse Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rbtree_augmented.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/rbtree_augmented.h b/include/linux/rbtree_augmented.h index 214caa33433b..2ac60c9cf644 100644 --- a/include/linux/rbtree_augmented.h +++ b/include/linux/rbtree_augmented.h @@ -24,6 +24,7 @@ #ifndef _LINUX_RBTREE_AUGMENTED_H #define _LINUX_RBTREE_AUGMENTED_H +#include #include /* From 86a595f961360c9c31d00fb111dd09e5d5ba9a40 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Thu, 25 Oct 2012 13:37:56 -0700 Subject: [PATCH 09/18] mm/page_alloc.c:alloc_contig_range(): return early for err path If start_isolate_page_range() failed, unset_migratetype_isolate() has been done inside it. Signed-off-by: Bob Liu Cc: Ni zhan Chen Cc: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bb90971182bd..b0012ab372a4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5825,7 +5825,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, ret = start_isolate_page_range(pfn_max_align_down(start), pfn_max_align_up(end), migratetype); if (ret) - goto done; + return ret; ret = __alloc_contig_migrate_range(&cc, start, end); if (ret) From 59ce8764bdfe8f3c6c02d3215741584dbb43409d Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 25 Oct 2012 13:37:57 -0700 Subject: [PATCH 10/18] UAPI: fix tools/vm/page-types.c Fix tools/vm/page-types.c to use the UAPI variant of linux/kernel-page-flags.h lest the following error appear: In file included from page-types.c:38:0: ../../include/linux/kernel-page-flags.h:4:42: fatal error: uapi/linux/kernel-page-flags.h: No such file or directory Reported-by: Daniel Hazelton Signed-off-by: David Howells Reviewed-by: Fengguang Wu Tested-by: Daniel Hazelton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/page-types.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index cd1b03e80899..b76edf2f8333 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -35,7 +35,7 @@ #include #include #include "../../include/uapi/linux/magic.h" -#include "../../include/linux/kernel-page-flags.h" +#include "../../include/uapi/linux/kernel-page-flags.h" #ifndef MAX_PATH From fc314d0a4a933603f521de343634910a4ed9b37b Mon Sep 17 00:00:00 2001 From: Daniel Hazelton Date: Thu, 25 Oct 2012 13:37:59 -0700 Subject: [PATCH 11/18] tools/testing/selftests/epoll/test_epoll.c: fix build Latest Linus head run of "make selftests" in the tools directory failed with references to undefined variables. Reference was to 'write_thread_data' which is the name of a struct that is being used, not the variable itself. Change reference so it points to the variable. Signed-off-by: Daniel Hazelton Cc: "Paton J. Lewis" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/epoll/test_epoll.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/epoll/test_epoll.c b/tools/testing/selftests/epoll/test_epoll.c index e0fcff1e8331..f7525392ce84 100644 --- a/tools/testing/selftests/epoll/test_epoll.c +++ b/tools/testing/selftests/epoll/test_epoll.c @@ -162,14 +162,14 @@ void *write_thread_function(void *function_data) int index; struct write_thread_data *thread_data = (struct write_thread_data *)function_data; - while (!write_thread_data->stop) + while (!thread_data->stop) for (index = 0; !thread_data->stop && (index < thread_data->n_fds); ++index) if ((write(thread_data->fds[index], &data, 1) < 1) && (errno != EAGAIN) && (errno != EWOULDBLOCK)) { - write_thread_data->status = errno; + thread_data->status = errno; return; } } From 35cfa2b0b491c37e23527822bf365610dbb188e5 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 25 Oct 2012 13:38:01 -0700 Subject: [PATCH 12/18] mm/mmu_notifier: allocate mmu_notifier in advance While allocating mmu_notifier with parameter GFP_KERNEL, swap would start to work in case of tight available memory. Eventually, that would lead to a deadlock while the swap deamon swaps anonymous pages. It was caused by commit e0f3c3f78da29b ("mm/mmu_notifier: init notifier if necessary"). ================================= [ INFO: inconsistent lock state ] 3.7.0-rc1+ #518 Not tainted --------------------------------- inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-W} usage. kswapd0/35 [HC0[0]:SC0[0]:HE1:SE1] takes: (&mapping->i_mmap_mutex){+.+.?.}, at: page_referenced+0x9c/0x2e0 {RECLAIM_FS-ON-W} state was registered at: mark_held_locks+0x86/0x150 lockdep_trace_alloc+0x67/0xc0 kmem_cache_alloc_trace+0x33/0x230 do_mmu_notifier_register+0x87/0x180 mmu_notifier_register+0x13/0x20 kvm_dev_ioctl+0x428/0x510 do_vfs_ioctl+0x98/0x570 sys_ioctl+0x91/0xb0 system_call_fastpath+0x16/0x1b irq event stamp: 825 hardirqs last enabled at (825): _raw_spin_unlock_irq+0x30/0x60 hardirqs last disabled at (824): _raw_spin_lock_irq+0x19/0x80 softirqs last enabled at (0): copy_process+0x630/0x17c0 softirqs last disabled at (0): (null) ... Simply back out the above commit, which was a small performance optimization. Signed-off-by: Gavin Shan Reported-by: Andrea Righi Tested-by: Andrea Righi Cc: Wanpeng Li Cc: Andrea Arcangeli Cc: Avi Kivity Cc: Hugh Dickins Cc: Marcelo Tosatti Cc: Xiao Guangrong Cc: Sagi Grimberg Cc: Haggai Eran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmu_notifier.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 479a1e751a73..8a5ac8c686b0 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -196,28 +196,28 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, BUG_ON(atomic_read(&mm->mm_users) <= 0); /* - * Verify that mmu_notifier_init() already run and the global srcu is - * initialized. - */ + * Verify that mmu_notifier_init() already run and the global srcu is + * initialized. + */ BUG_ON(!srcu.per_cpu_ref); + ret = -ENOMEM; + mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), GFP_KERNEL); + if (unlikely(!mmu_notifier_mm)) + goto out; + if (take_mmap_sem) down_write(&mm->mmap_sem); ret = mm_take_all_locks(mm); if (unlikely(ret)) - goto out; + goto out_clean; if (!mm_has_notifiers(mm)) { - mmu_notifier_mm = kmalloc(sizeof(struct mmu_notifier_mm), - GFP_KERNEL); - if (unlikely(!mmu_notifier_mm)) { - ret = -ENOMEM; - goto out_of_mem; - } INIT_HLIST_HEAD(&mmu_notifier_mm->list); spin_lock_init(&mmu_notifier_mm->lock); mm->mmu_notifier_mm = mmu_notifier_mm; + mmu_notifier_mm = NULL; } atomic_inc(&mm->mm_count); @@ -233,12 +233,12 @@ static int do_mmu_notifier_register(struct mmu_notifier *mn, hlist_add_head(&mn->hlist, &mm->mmu_notifier_mm->list); spin_unlock(&mm->mmu_notifier_mm->lock); -out_of_mem: mm_drop_all_locks(mm); -out: +out_clean: if (take_mmap_sem) up_write(&mm->mmap_sem); - + kfree(mmu_notifier_mm); +out: BUG_ON(atomic_read(&mm->mm_users) <= 0); return ret; } From d5ea7b5ec1ee4dac868143806c0bd94855754677 Mon Sep 17 00:00:00 2001 From: Hein Tibosch Date: Thu, 25 Oct 2012 13:38:05 -0700 Subject: [PATCH 13/18] drivers/dma/dw_dmac: make driver's endianness configurable The dw_dmac driver was originally developed for avr32 to be used with the Synopsys DesignWare AHB DMA controller. Starting from 2.6.38, access to the device's i/o memory was done with the little-endian readl/writel functions(1) This broke the driver for the avr32 platform, because it needs big (native) endian accessors. This patch makes the endianness configurable using 'DW_DMAC_BIG_ENDIAN_IO', which will default be true for AVR32 I submitted this patch before(2) but then waited for Andy to finish other changes to the same module(3). (1) https://patchwork.kernel.org/patch/608211 (2) https://lkml.org/lkml/2012/8/26/148 (3) https://lkml.org/lkml/2012/9/21/173 Signed-off-by: Hein Tibosch Acked-by: Arnd Bergmann Cc: Andy Shevchenko Acked-by: Viresh Kumar Cc: Hans-Christian Egtvedt Cc: Ludovic Desroches Cc: Havard Skinnemoen Cc: Nicolas Ferre Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/dma/Kconfig | 11 +++++++++++ drivers/dma/dw_dmac_regs.h | 18 +++++++++++++----- 2 files changed, 24 insertions(+), 5 deletions(-) diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 677cd6e4e1a1..d4c12180c654 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -90,6 +90,17 @@ config DW_DMAC Support the Synopsys DesignWare AHB DMA controller. This can be integrated in chips such as the Atmel AT32ap7000. +config DW_DMAC_BIG_ENDIAN_IO + bool "Use big endian I/O register access" + default y if AVR32 + depends on DW_DMAC + help + Say yes here to use big endian I/O access when reading and writing + to the DMA controller registers. This is needed on some platforms, + like the Atmel AVR32 architecture. + + If unsure, use the default setting. + config AT_HDMAC tristate "Atmel AHB DMA support" depends on ARCH_AT91 diff --git a/drivers/dma/dw_dmac_regs.h b/drivers/dma/dw_dmac_regs.h index ff39fa6cd2bc..88965597b7d0 100644 --- a/drivers/dma/dw_dmac_regs.h +++ b/drivers/dma/dw_dmac_regs.h @@ -98,9 +98,17 @@ struct dw_dma_regs { u32 DW_PARAMS; }; +#ifdef CONFIG_DW_DMAC_BIG_ENDIAN_IO +#define dma_readl_native ioread32be +#define dma_writel_native iowrite32be +#else +#define dma_readl_native readl +#define dma_writel_native writel +#endif + /* To access the registers in early stage of probe */ #define dma_read_byaddr(addr, name) \ - readl((addr) + offsetof(struct dw_dma_regs, name)) + dma_readl_native((addr) + offsetof(struct dw_dma_regs, name)) /* Bitfields in DW_PARAMS */ #define DW_PARAMS_NR_CHAN 8 /* number of channels */ @@ -216,9 +224,9 @@ __dwc_regs(struct dw_dma_chan *dwc) } #define channel_readl(dwc, name) \ - readl(&(__dwc_regs(dwc)->name)) + dma_readl_native(&(__dwc_regs(dwc)->name)) #define channel_writel(dwc, name, val) \ - writel((val), &(__dwc_regs(dwc)->name)) + dma_writel_native((val), &(__dwc_regs(dwc)->name)) static inline struct dw_dma_chan *to_dw_dma_chan(struct dma_chan *chan) { @@ -246,9 +254,9 @@ static inline struct dw_dma_regs __iomem *__dw_regs(struct dw_dma *dw) } #define dma_readl(dw, name) \ - readl(&(__dw_regs(dw)->name)) + dma_readl_native(&(__dw_regs(dw)->name)) #define dma_writel(dw, name, val) \ - writel((val), &(__dw_regs(dw)->name)) + dma_writel_native((val), &(__dw_regs(dw)->name)) #define channel_set_bit(dw, reg, mask) \ dma_writel(dw, reg, ((mask) << 8) | (mask)) From f2302505775fd13ba93f034206f1e2a587017929 Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Thu, 25 Oct 2012 13:38:07 -0700 Subject: [PATCH 14/18] pidns: limit the nesting depth of pid namespaces 'struct pid' is a "variable sized struct" - a header with an array of upids at the end. The size of the array depends on a level (depth) of pid namespaces. Now a level of pidns is not limited, so 'struct pid' can be more than one page. Looks reasonable, that it should be less than a page. MAX_PIS_NS_LEVEL is not calculated from PAGE_SIZE, because in this case it depends on architectures, config options and it will be reduced, if someone adds a new fields in struct pid or struct upid. I suggest to set MAX_PIS_NS_LEVEL = 32, because it saves ability to expand "struct pid" and it's more than enough for all known for me use-cases. When someone finds a reasonable use case, we can add a config option or a sysctl parameter. In addition it will reduce the effect of another problem, when we have many nested namespaces and the oldest one starts dying. zap_pid_ns_processe will be called for each namespace and find_vpid will be called for each process in a namespace. find_vpid will be called minimum max_level^2 / 2 times. The reason of that is that when we found a bit in pidmap, we can't determine this pidns is top for this process or it isn't. vpid is a heavy operation, so a fork bomb, which create many nested namespace, can make a system inaccessible for a long time. For example my system becomes inaccessible for a few minutes with 4000 processes. [akpm@linux-foundation.org: return -EINVAL in response to excessive nesting, not -ENOMEM] Signed-off-by: Andrew Vagin Acked-by: Oleg Nesterov Cc: Cyrill Gorcunov Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index eb00be205811..7b07cc0dfb75 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -71,12 +71,22 @@ err_alloc: return NULL; } +/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ +#define MAX_PID_NS_LEVEL 32 + static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; - int i, err = -ENOMEM; + int i; + int err; + if (level > MAX_PID_NS_LEVEL) { + err = -EINVAL; + goto out; + } + + err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) goto out; From 6b187d0260b6cd1d0904309f32659b7ed5948af8 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 25 Oct 2012 13:38:08 -0700 Subject: [PATCH 15/18] mm, numa: avoid setting zone_reclaim_mode unless a node is sufficiently distant Commit 957f822a0ab9 ("mm, numa: reclaim from all nodes within reclaim distance") caused zone_reclaim_mode to be set for all systems where two nodes are within RECLAIM_DISTANCE of each other. This is the opposite of what we actually want: zone_reclaim_mode should be set if two nodes are sufficiently distant. Signed-off-by: David Rientjes Reported-by: Julian Wollrath Tested-by: Julian Wollrath Cc: Hugh Dickins Cc: Patrik Kullman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b0012ab372a4..5b74de6702e0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1809,10 +1809,10 @@ static void __paginginit init_zone_allows_reclaim(int nid) int i; for_each_online_node(i) - if (node_distance(nid, i) <= RECLAIM_DISTANCE) { + if (node_distance(nid, i) <= RECLAIM_DISTANCE) node_set(i, NODE_DATA(nid)->reclaim_nodes); + else zone_reclaim_mode = 1; - } } #else /* CONFIG_NUMA */ From fee0de7791f967c2c5f0d43eb7b7261761b45e64 Mon Sep 17 00:00:00 2001 From: Jan Luebbe Date: Thu, 25 Oct 2012 13:38:11 -0700 Subject: [PATCH 16/18] drivers/rtc/rtc-imxdi.c: add missing spin lock initialization Signed-off-by: Jan Luebbe Cc: Alessandro Zummo Cc: Roland Stigge Cc: Grant Likely Tested-by: Roland Stigge Cc: Sascha Hauer Cc: Russell King Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/rtc/rtc-imxdi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/rtc/rtc-imxdi.c b/drivers/rtc/rtc-imxdi.c index 891cd6c61d0a..4eed51044c5d 100644 --- a/drivers/rtc/rtc-imxdi.c +++ b/drivers/rtc/rtc-imxdi.c @@ -392,6 +392,8 @@ static int dryice_rtc_probe(struct platform_device *pdev) if (imxdi->ioaddr == NULL) return -ENOMEM; + spin_lock_init(&imxdi->irq_lock); + imxdi->irq = platform_get_irq(pdev, 0); if (imxdi->irq < 0) return imxdi->irq; From 20f1de659b77364d55d4e7fad2ef657e7730323f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 25 Oct 2012 13:38:14 -0700 Subject: [PATCH 17/18] gen_init_cpio: avoid stack overflow when expanding Fix possible overflow of the buffer used for expanding environment variables when building file list. In the extremely unlikely case of an attacker having control over the environment variables visible to gen_init_cpio, control over the contents of the file gen_init_cpio parses, and gen_init_cpio was built without compiler hardening, the attacker can gain arbitrary execution control via a stack buffer overflow. $ cat usr/crash.list file foo ${BIG}${BIG}${BIG}${BIG}${BIG}${BIG} 0755 0 0 $ BIG=$(perl -e 'print "A" x 4096;') ./usr/gen_init_cpio usr/crash.list *** buffer overflow detected ***: ./usr/gen_init_cpio terminated This also replaces the space-indenting with tabs. Patch based on existing fix extracted from grsecurity. Signed-off-by: Kees Cook Cc: Michal Marek Cc: Brad Spengler Cc: PaX Team Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- usr/gen_init_cpio.c | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/usr/gen_init_cpio.c b/usr/gen_init_cpio.c index af0f22fb1ef7..aca6edcbbc6f 100644 --- a/usr/gen_init_cpio.c +++ b/usr/gen_init_cpio.c @@ -303,7 +303,7 @@ static int cpio_mkfile(const char *name, const char *location, int retval; int rc = -1; int namesize; - int i; + unsigned int i; mode |= S_IFREG; @@ -381,25 +381,28 @@ error: static char *cpio_replace_env(char *new_location) { - char expanded[PATH_MAX + 1]; - char env_var[PATH_MAX + 1]; - char *start; - char *end; + char expanded[PATH_MAX + 1]; + char env_var[PATH_MAX + 1]; + char *start; + char *end; - for (start = NULL; (start = strstr(new_location, "${")); ) { - end = strchr(start, '}'); - if (start < end) { - *env_var = *expanded = '\0'; - strncat(env_var, start + 2, end - start - 2); - strncat(expanded, new_location, start - new_location); - strncat(expanded, getenv(env_var), PATH_MAX); - strncat(expanded, end + 1, PATH_MAX); - strncpy(new_location, expanded, PATH_MAX); - } else - break; - } + for (start = NULL; (start = strstr(new_location, "${")); ) { + end = strchr(start, '}'); + if (start < end) { + *env_var = *expanded = '\0'; + strncat(env_var, start + 2, end - start - 2); + strncat(expanded, new_location, start - new_location); + strncat(expanded, getenv(env_var), + PATH_MAX - strlen(expanded)); + strncat(expanded, end + 1, + PATH_MAX - strlen(expanded)); + strncpy(new_location, expanded, PATH_MAX); + new_location[PATH_MAX] = 0; + } else + break; + } - return new_location; + return new_location; } From 12176503366885edd542389eed3aaf94be163fdb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 25 Oct 2012 13:38:16 -0700 Subject: [PATCH 18/18] fs/compat_ioctl.c: VIDEO_SET_SPU_PALETTE missing error check The compat ioctl for VIDEO_SET_SPU_PALETTE was missing an error check while converting ioctl arguments. This could lead to leaking kernel stack contents into userspace. Patch extracted from existing fix in grsecurity. Signed-off-by: Kees Cook Cc: David Miller Cc: Brad Spengler Cc: PaX Team Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index f5054025f9da..4c6285fff598 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -210,6 +210,8 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, err = get_user(palp, &up->palette); err |= get_user(length, &up->length); + if (err) + return -EFAULT; up_native = compat_alloc_user_space(sizeof(struct video_spu_palette)); err = put_user(compat_ptr(palp), &up_native->palette);