From c225aa57ff4ffe715df4692676b77c815a337236 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Simon=20Holm=20Th=C3=B8gersen?= Date: Sun, 11 Jan 2009 22:34:01 -0500 Subject: [PATCH 01/60] ext4: fix wrong use of do_div MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit the following warning: fs/jbd2/journal.c: In function ‘jbd2_seq_info_show’: fs/jbd2/journal.c:850: warning: format ‘%lu’ expects type ‘long unsigned int’, but argument 3 has type ‘uint32_t’ is caused by wrong usage of do_div that modifies the dividend in-place and returns the quotient. So not only would an incorrect value be displayed, but s->journal->j_average_commit_time would also be changed to a wrong value! Fix it by using div_u64 instead. Signed-off-by: Simon Holm Thøgersen Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 56675306ed81..eb343008eded 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -37,10 +37,10 @@ #include #include #include +#include #include #include -#include EXPORT_SYMBOL(jbd2_journal_start); EXPORT_SYMBOL(jbd2_journal_restart); @@ -846,8 +846,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v) jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); seq_printf(seq, " %ums logging transaction\n", jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); - seq_printf(seq, " %luus average transaction commit time\n", - do_div(s->journal->j_average_commit_time, 1000)); + seq_printf(seq, " %lluus average transaction commit time\n", + div_u64(s->journal->j_average_commit_time, 1000)); seq_printf(seq, " %lu handles per transaction\n", s->stats->u.run.rs_handle_count / s->stats->ts_tid); seq_printf(seq, " %lu blocks per transaction\n", From 06a279d636734da32bb62dd2f7b0ade666f65d7c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 17 Jan 2009 18:41:37 -0500 Subject: [PATCH 02/60] ext4: only use i_size_high for regular files Directories are not allowed to be bigger than 2GB, so don't use i_size_high for anything other than regular files. E2fsck should complain about these inodes, but the simplest thing to do for the kernel is to only use i_size_high for regular files. This prevents an intentially corrupted filesystem from causing the kernel to burn a huge amount of CPU and issuing error messages such as: EXT4-fs warning (device loop0): ext4_block_to_path: block 135090028 > max Thanks to David Maciejak from Fortinet's FortiGuard Global Security Research Team for reporting this issue. http://bugzilla.kernel.org/show_bug.cgi?id=12375 Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/ext4.h | 7 +++++-- fs/ext4/inode.c | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c668e4377d76..aafc9eba1c25 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1206,8 +1206,11 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, static inline loff_t ext4_isize(struct ext4_inode *raw_inode) { - return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | - le32_to_cpu(raw_inode->i_size_lo); + if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); + else + return (loff_t) le32_to_cpu(raw_inode->i_size_lo); } static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a6444cee0c7e..49484ba801c9 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -360,9 +360,9 @@ static int ext4_block_to_path(struct inode *inode, final = ptrs; } else { ext4_warning(inode->i_sb, "ext4_block_to_path", - "block %lu > max", + "block %lu > max in inode %lu", i_block + direct_blocks + - indirect_blocks + double_blocks); + indirect_blocks + double_blocks, inode->i_ino); } if (boundary) *boundary = final - 1 - (i_block & (ptrs - 1)); From e6b8bc09ba2075cd91fbffefcd2778b1a00bd76f Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 16 Jan 2009 11:13:40 -0500 Subject: [PATCH 03/60] ext4: Add sanity check to make_indexed_dir Make sure the rec_len field in the '..' entry is sane, lest we overrun the directory block and cause a kernel oops on a purposefully corrupted filesystem. Thanks to Sami Liedes for reporting this bug. http://bugzilla.kernel.org/show_bug.cgi?id=12430 Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/namei.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index fec0b4c2f5f1..ba702bd7910d 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1368,7 +1368,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct fake_dirent *fde; blocksize = dir->i_sb->s_blocksize; - dxtrace(printk(KERN_DEBUG "Creating index\n")); + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); retval = ext4_journal_get_write_access(handle, bh); if (retval) { ext4_std_error(dir->i_sb, retval); @@ -1377,6 +1377,20 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, } root = (struct dx_root *) bh->b_data; + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext4_dir_entry_2 *)((char *)fde + + ext4_rec_len_from_disk(fde->rec_len)); + if ((char *) de >= (((char *) root) + blocksize)) { + ext4_error(dir->i_sb, __func__, + "invalid rec_len for '..' in inode %lu", + dir->i_ino); + brelse(bh); + return -EIO; + } + len = ((char *) root) + blocksize - (char *) de; + + /* Allocate new block for the 0th block's dirents */ bh2 = ext4_append(handle, dir, &block, &retval); if (!(bh2)) { brelse(bh); @@ -1385,11 +1399,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, EXT4_I(dir)->i_flags |= EXT4_INDEX_FL; data1 = bh2->b_data; - /* The 0th block becomes the root, move the dirents out */ - fde = &root->dotdot; - de = (struct ext4_dir_entry_2 *)((char *)fde + - ext4_rec_len_from_disk(fde->rec_len)); - len = ((char *) root) + blocksize - (char *) de; memcpy (data1, de, len); de = (struct ext4_dir_entry_2 *) data1; top = data1 + len; From a21102b55c4f8dfd3adb4a15a34cd62237b46039 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 16 Jan 2009 11:13:47 -0500 Subject: [PATCH 04/60] ext3: Add sanity check to make_indexed_dir Make sure the rec_len field in the '..' entry is sane, lest we overrun the directory block and cause a kernel oops on a purposefully corrupted filesystem. This fixes a bug related to a bug originally reported by Sami Liedes for ext4 at: http://bugzilla.kernel.org/show_bug.cgi?id=12430 Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext3/namei.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 69a3d19ca9fd..4db4ffa1edad 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1358,7 +1358,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct fake_dirent *fde; blocksize = dir->i_sb->s_blocksize; - dxtrace(printk("Creating index\n")); + dxtrace(printk(KERN_DEBUG "Creating index: inode %lu\n", dir->i_ino)); retval = ext3_journal_get_write_access(handle, bh); if (retval) { ext3_std_error(dir->i_sb, retval); @@ -1367,6 +1367,19 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, } root = (struct dx_root *) bh->b_data; + /* The 0th block becomes the root, move the dirents out */ + fde = &root->dotdot; + de = (struct ext3_dir_entry_2 *)((char *)fde + + ext3_rec_len_from_disk(fde->rec_len)); + if ((char *) de >= (((char *) root) + blocksize)) { + ext3_error(dir->i_sb, __func__, + "invalid rec_len for '..' in inode %lu", + dir->i_ino); + brelse(bh); + return -EIO; + } + len = ((char *) root) + blocksize - (char *) de; + bh2 = ext3_append (handle, dir, &block, &retval); if (!(bh2)) { brelse(bh); @@ -1375,11 +1388,6 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, EXT3_I(dir)->i_flags |= EXT3_INDEX_FL; data1 = bh2->b_data; - /* The 0th block becomes the root, move the dirents out */ - fde = &root->dotdot; - de = (struct ext3_dir_entry_2 *)((char *)fde + - ext3_rec_len_from_disk(fde->rec_len)); - len = ((char *) root) + blocksize - (char *) de; memcpy (data1, de, len); de = (struct ext3_dir_entry_2 *) data1; top = data1 + len; From 08ec8c3878cea0bf91f2ba3c0badf44b383752d0 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 16 Jan 2009 11:57:00 -0500 Subject: [PATCH 05/60] jbd2: On a __journal_expect() assertion failure printk "JBD2", not "EXT3-fs" Otherwise it can be very confusing to find a "EXT3-fs: " failure in the middle of EXT4-fs failures, and it makes it harder to track the source of the failure. Signed-off-by: "Theodore Ts'o" --- include/linux/jbd2.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index b45109c61fba..b28b37eb11c6 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -308,7 +308,8 @@ void buffer_assertion_failure(struct buffer_head *bh); int val = (expr); \ if (!val) { \ printk(KERN_ERR \ - "EXT3-fs unexpected failure: %s;\n",# expr); \ + "JBD2 unexpected failure: %s: %s;\n", \ + __func__, #expr); \ printk(KERN_ERR why "\n"); \ } \ val; \ From e7f07968c16bdd9480001c0a9de013ba56889cf9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 20 Jan 2009 09:50:19 -0500 Subject: [PATCH 06/60] ext4: Fix ext4_free_blocks() w/o a journal when files have indirect blocks When trying to unlink a file with indirect blocks on a filesystem without a journal, the "circular indirect block" sanity test was getting falsely triggered. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 49484ba801c9..b4386dafeb0c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3622,7 +3622,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, * block pointed to itself, it would have been detached when * the block was cleared. Check for this instead of OOPSing. */ - if (bh2jh(this_bh)) + if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) ext4_handle_dirty_metadata(handle, inode, this_bh); else ext4_error(inode->i_sb, __func__, From fdff73f094e7220602cc3f8959c7230517976412 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 26 Jan 2009 19:06:41 -0500 Subject: [PATCH 07/60] ext4: Initialize the new group descriptor when resizing the filesystem Make sure all of the fields of the group descriptor are properly initialized. Previously, we allowed bg_flags field to be contain random garbage, which could trigger non-deterministic behavior, including a kernel OOPS. http://bugzilla.kernel.org/show_bug.cgi?id=12433 Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/resize.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index c328be5d6885..c06886abd658 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -861,12 +861,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) gdp = (struct ext4_group_desc *)((char *)primary->b_data + gdb_off * EXT4_DESC_SIZE(sb)); + memset(gdp, 0, EXT4_DESC_SIZE(sb)); ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ ext4_free_blks_set(sb, gdp, input->free_blocks_count); ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); - gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); + gdp->bg_flags = cpu_to_le16(EXT4_BG_INODE_ZEROED); gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); /* From 9fd9784c91db79e953ea3fe3741f885bdc390a72 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Mon, 26 Jan 2009 19:26:26 -0500 Subject: [PATCH 08/60] ext4: Fix building with EXT4FS_DEBUG When bg_free_blocks_count was renamed to bg_free_blocks_count_lo in 560671a0, its uses under EXT4FS_DEBUG were not changed to the helper ext4_free_blks_count. Another commit, 498e5f24, also did not change everything needed under EXT4FS_DEBUG, thus making it spill some warnings related to printing format. This commit fixes both issues and makes ext4 build again when EXT4FS_DEBUG is enabled. Signed-off-by: Thadeu Lima de Souza Cascardo Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 6 +++--- fs/ext4/extents.c | 2 +- fs/ext4/mballoc.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 6bba06b09dd1..9a50b8052dcf 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -684,15 +684,15 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + desc_count += ext4_free_blks_count(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, i); if (bitmap_bh == NULL) continue; x = ext4_count_free(bitmap_bh, sb->s_blocksize); - printk(KERN_DEBUG "group %lu: stored = %d, counted = %u\n", - i, le16_to_cpu(gdp->bg_free_blocks_count), x); + printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", + i, ext4_free_blks_count(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 54bf0623a9ae..e2eab196875f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3048,7 +3048,7 @@ retry: WARN_ON(ret <= 0); printk(KERN_ERR "%s: ext4_ext_get_blocks " "returned error inode#%lu, block=%u, " - "max_blocks=%lu", __func__, + "max_blocks=%u", __func__, inode->i_ino, block, max_blocks); #endif ext4_mark_inode_dirty(handle, inode); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 918aec0c8a11..deba54f6cbed 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3025,7 +3025,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, goto out_err; ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, - gdp->bg_free_blocks_count); + ext4_free_blks_count(sb, gdp)); err = ext4_journal_get_write_access(handle, gdp_bh); if (err) From 8527bec548e01a29c6d1928d20d6d3be71861482 Mon Sep 17 00:00:00 2001 From: "Ira W. Snyder" Date: Mon, 26 Jan 2009 21:00:33 -0800 Subject: [PATCH 09/60] virtio_net: use correct accessors for scatterlists Without this fix, virtio_net makes incorrect usage of scatterlists. It sets the end of the scatterlist chain after the first element, despite the fact that more entries come after it. If you try to run dma_map_sg() on one of the scatterlists given to you by add_buf(), you will get a null pointer oops. Signed-off-by: Ira W. Snyder Signed-off-by: Rusty Russell Signed-off-by: David S. Miller --- drivers/net/virtio_net.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 63ef2a8905fb..c68808336c8c 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -287,7 +287,7 @@ static void try_fill_recv_maxbufs(struct virtnet_info *vi) skb_put(skb, MAX_PACKET_LEN); hdr = skb_vnet_hdr(skb); - sg_init_one(sg, hdr, sizeof(*hdr)); + sg_set_buf(sg, hdr, sizeof(*hdr)); if (vi->big_packets) { for (i = 0; i < MAX_SKB_FRAGS; i++) { @@ -488,9 +488,9 @@ static int xmit_skb(struct virtnet_info *vi, struct sk_buff *skb) /* Encode metadata header at front. */ if (vi->mergeable_rx_bufs) - sg_init_one(sg, mhdr, sizeof(*mhdr)); + sg_set_buf(sg, mhdr, sizeof(*mhdr)); else - sg_init_one(sg, hdr, sizeof(*hdr)); + sg_set_buf(sg, hdr, sizeof(*hdr)); num = skb_to_sgvec(skb, sg+1, 0, skb->len) + 1; From 98322f22eca889478045cf896b572250d03dc45f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 26 Jan 2009 21:35:35 -0800 Subject: [PATCH 10/60] udp: optimize bind(0) if many ports are in use commit 9088c5609584684149f3fb5b065aa7f18dcb03ff (udp: Improve port randomization) introduced a regression for UDP bind() syscall to null port (getting a random port) in case lot of ports are already in use. This is because we do about 28000 scans of very long chains (220 sockets per chain), with many spin_lock_bh()/spin_unlock_bh() calls. Fix this using a bitmap (64 bytes for current value of UDP_HTABLE_SIZE) so that we scan chains at most once. Instead of 250 ms per bind() call, we get after patch a time of 2.9 ms Based on a report from Vitaly Mayatskikh Reported-by: Vitaly Mayatskikh Signed-off-by: Eric Dumazet Tested-by: Vitaly Mayatskikh Signed-off-by: David S. Miller --- net/ipv4/udp.c | 55 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 39 insertions(+), 16 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cf5ab0581eba..b7faffe5c029 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -120,8 +120,11 @@ EXPORT_SYMBOL(sysctl_udp_wmem_min); atomic_t udp_memory_allocated; EXPORT_SYMBOL(udp_memory_allocated); +#define PORTS_PER_CHAIN (65536 / UDP_HTABLE_SIZE) + static int udp_lib_lport_inuse(struct net *net, __u16 num, const struct udp_hslot *hslot, + unsigned long *bitmap, struct sock *sk, int (*saddr_comp)(const struct sock *sk1, const struct sock *sk2)) @@ -132,12 +135,17 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num, sk_nulls_for_each(sk2, node, &hslot->head) if (net_eq(sock_net(sk2), net) && sk2 != sk && - sk2->sk_hash == num && + (bitmap || sk2->sk_hash == num) && (!sk2->sk_reuse || !sk->sk_reuse) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && - (*saddr_comp)(sk, sk2)) - return 1; + (*saddr_comp)(sk, sk2)) { + if (bitmap) + __set_bit(sk2->sk_hash / UDP_HTABLE_SIZE, + bitmap); + else + return 1; + } return 0; } @@ -160,32 +168,47 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, if (!snum) { int low, high, remaining; unsigned rand; - unsigned short first; + unsigned short first, last; + DECLARE_BITMAP(bitmap, PORTS_PER_CHAIN); inet_get_local_port_range(&low, &high); remaining = (high - low) + 1; rand = net_random(); - snum = first = rand % remaining + low; - rand |= 1; - for (;;) { - hslot = &udptable->hash[udp_hashfn(net, snum)]; + first = (((u64)rand * remaining) >> 32) + low; + /* + * force rand to be an odd multiple of UDP_HTABLE_SIZE + */ + rand = (rand | 1) * UDP_HTABLE_SIZE; + for (last = first + UDP_HTABLE_SIZE; first != last; first++) { + hslot = &udptable->hash[udp_hashfn(net, first)]; + bitmap_zero(bitmap, PORTS_PER_CHAIN); spin_lock_bh(&hslot->lock); - if (!udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp)) - break; - spin_unlock_bh(&hslot->lock); + udp_lib_lport_inuse(net, snum, hslot, bitmap, sk, + saddr_comp); + + snum = first; + /* + * Iterate on all possible values of snum for this hash. + * Using steps of an odd multiple of UDP_HTABLE_SIZE + * give us randomization and full range coverage. + */ do { - snum = snum + rand; - } while (snum < low || snum > high); - if (snum == first) - goto fail; + if (low <= snum && snum <= high && + !test_bit(snum / UDP_HTABLE_SIZE, bitmap)) + goto found; + snum += rand; + } while (snum != first); + spin_unlock_bh(&hslot->lock); } + goto fail; } else { hslot = &udptable->hash[udp_hashfn(net, snum)]; spin_lock_bh(&hslot->lock); - if (udp_lib_lport_inuse(net, snum, hslot, sk, saddr_comp)) + if (udp_lib_lport_inuse(net, snum, hslot, NULL, sk, saddr_comp)) goto fail_unlock; } +found: inet_sk(sk)->num = snum; sk->sk_hash = snum; if (sk_unhashed(sk)) { From a7a41acf99d9150b424839b0d7b4f5ad9d211e2d Mon Sep 17 00:00:00 2001 From: Manish Katiyar Date: Mon, 26 Jan 2009 21:54:21 -0800 Subject: [PATCH 11/60] r6040: Remove unused variable pdev from drivers/net/r6040.c drivers/net/r6040.c:441: warning: unused variable 'pdev' Signed-off-by: Manish Katiyar Signed-off-by: David S. Miller --- drivers/net/r6040.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/r6040.c b/drivers/net/r6040.c index 72fd9e97c190..b2dcdb5ed8bd 100644 --- a/drivers/net/r6040.c +++ b/drivers/net/r6040.c @@ -438,7 +438,6 @@ static void r6040_down(struct net_device *dev) { struct r6040_private *lp = netdev_priv(dev); void __iomem *ioaddr = lp->base; - struct pci_dev *pdev = lp->pdev; int limit = 2048; u16 *adrp; u16 cmd; From 9fa5fdf291c9b58b1cb8b4bb2a0ee57efa21d635 Mon Sep 17 00:00:00 2001 From: Dimitris Michailidis Date: Mon, 26 Jan 2009 22:15:31 -0800 Subject: [PATCH 12/60] tcp: Fix length tcp_splice_data_recv passes to skb_splice_bits. tcp_splice_data_recv has two lengths to consider: the len parameter it gets from tcp_read_sock, which specifies the amount of data in the skb, and rd_desc->count, which is the amount of data the splice caller still wants. Currently it passes just the latter to skb_splice_bits, which then splices min(rd_desc->count, skb->len - offset) bytes. Most of the time this is fine, except when the skb contains urgent data. In that case len goes only up to the urgent byte and is less than skb->len - offset. By ignoring len tcp_splice_data_recv may a) splice data tcp_read_sock told it not to, b) return to tcp_read_sock a value > len. Now, tcp_read_sock doesn't handle used > len and leaves the socket in a bad state (both sk_receive_queue and copied_seq are bad at that point) resulting in duplicated data and corruption. Fix by passing min(rd_desc->count, len) to skb_splice_bits. Signed-off-by: Dimitris Michailidis Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0cd71b84e483..76b148bcb0dc 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -524,7 +524,8 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, struct tcp_splice_state *tss = rd_desc->arg.data; int ret; - ret = skb_splice_bits(skb, offset, tss->pipe, rd_desc->count, tss->flags); + ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len), + tss->flags); if (ret > 0) rd_desc->count -= ret; return ret; From 15b2bee22a0390d951301b53e83df88d0350c499 Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Tue, 27 Jan 2009 16:41:58 -0800 Subject: [PATCH 13/60] e1000: fix bug with shared interrupt during reset A nasty bug was found where an MTU change (or anything else that caused a reset) could race with the interrupt code. The interrupt code was entered by a shared interrupt during the MTU change. This change prevents the interrupt code from running while the driver is in the middle of its reset path. Signed-off-by: Jesse Brandeburg Signed-off-by: David S. Miller --- drivers/net/e1000/e1000_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/e1000/e1000_main.c b/drivers/net/e1000/e1000_main.c index 26474c92193f..c986978ce761 100644 --- a/drivers/net/e1000/e1000_main.c +++ b/drivers/net/e1000/e1000_main.c @@ -31,7 +31,7 @@ char e1000_driver_name[] = "e1000"; static char e1000_driver_string[] = "Intel(R) PRO/1000 Network Driver"; -#define DRV_VERSION "7.3.20-k3-NAPI" +#define DRV_VERSION "7.3.21-k3-NAPI" const char e1000_driver_version[] = DRV_VERSION; static const char e1000_copyright[] = "Copyright (c) 1999-2006 Intel Corporation."; @@ -3712,7 +3712,7 @@ static irqreturn_t e1000_intr(int irq, void *data) struct e1000_hw *hw = &adapter->hw; u32 rctl, icr = er32(ICR); - if (unlikely(!icr)) + if (unlikely((!icr) || test_bit(__E1000_RESETTING, &adapter->flags))) return IRQ_NONE; /* Not our interrupt */ /* IMS will not auto-mask if INT_ASSERTED is not set, and if it is From 94cd3e6cbebf85903b4d53ed2147bdb4c6e08625 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 27 Jan 2009 17:45:10 -0800 Subject: [PATCH 14/60] net: wrong test in inet_ehash_locks_alloc() In commit 9db66bdcc83749affe61c61eb8ff3cf08f42afec (net: convert TCP/DCCP ehash rwlocks to spinlocks), I forgot to change one occurrence of rwlock_t to spinlock_t I believe sizeof(raw_spinlock_t) might be > 0 on !CONFIG_SMP if CONFIG_DEBUG_SPINLOCK while sizeof(raw_rwlock_t) should be 0 in this case. Fortunatly, CONFIG_DEBUG_SPINLOCK adds fields to both spinlock_t and rwlock_t, but at this might change in the future (being able to debug spinlocks but not rwlocks for example), better to be safe. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/inet_hashtables.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index f44bb5c77a70..d0a043153cc6 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -182,7 +182,7 @@ static inline int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) size = 2048; if (nr_pcpus >= 32) size = 4096; - if (sizeof(rwlock_t) != 0) { + if (sizeof(spinlock_t) != 0) { #ifdef CONFIG_NUMA if (size * sizeof(spinlock_t) > PAGE_SIZE) hashinfo->ehash_locks = vmalloc(size * sizeof(spinlock_t)); From 6c06a478c9e59d1584a5dc1b2b3519bae5d6546a Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 27 Jan 2009 22:30:19 -0800 Subject: [PATCH 15/60] net: fix xfrm reverse flow lookup for icmp6 This patch fixes the xfrm reverse flow lookup for icmp6 so that icmp6 packets don't get lost over ipsec tunnels. Similar patch is in RHEL5 kernel for a quite long time and I do not see why it isn't in mainline. Signed-off-by: Jiri Pirko Acked-by: Herbert Xu Signed-off-by: David S. Miller --- net/ipv6/icmp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 4f433847d95f..36dff8807183 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -443,10 +443,10 @@ void icmpv6_send(struct sk_buff *skb, int type, int code, __u32 info, if (xfrm_decode_session_reverse(skb, &fl2, AF_INET6)) goto relookup_failed; - if (ip6_dst_lookup(sk, &dst2, &fl)) + if (ip6_dst_lookup(sk, &dst2, &fl2)) goto relookup_failed; - err = xfrm_lookup(net, &dst2, &fl, sk, XFRM_LOOKUP_ICMP); + err = xfrm_lookup(net, &dst2, &fl2, sk, XFRM_LOOKUP_ICMP); switch (err) { case 0: dst_release(dst); From 1d6e55f195128813f96458203a9fa14204f9251e Mon Sep 17 00:00:00 2001 From: Thomas Goff Date: Tue, 27 Jan 2009 22:39:59 -0800 Subject: [PATCH 16/60] IPv6: Fix multicast routing bugs. This patch addresses the IPv6 multicast routing issues described below. It was tested with XORP 1.4/1.5 as the IPv6 PIM-SM routing daemon against FreeBSD peers. net/ipv6/ip6_input.c: - Don't try to forward link-local multicast packets. - Don't reset skb2->dev before calling ip6_mr_input() so packets can be identified as coming from the PIM register vif properly. net/ipv6/ip6mr.c: - Fix incoming PIM register messages processing: * The IPv6 pseudo-header should be included when checksumming PIM messages (RFC 4601 section 4.9; RFC 3973 section 4.7.1). * Packets decapsulated from PIM register messages should have skb->protocol ETH_P_IPV6. - Enable/disable IPv6 multicast forwarding on the corresponding interface when a routing daemon adds/removes a multicast virtual interface. - Remove incorrect skb_pull() to fix userspace signaling. - Enable/disable global IPv6 multicast forwarding when an IPv6 multicast routing socket is opened/closed. net/ipv6/route.c: - Don't use strict routing logic for packets decapsulated from PIM register messages (similar to disabling rp_filter for the IPv4 case). Signed-off-by: Thomas Goff Reviewed-by: Fred Templin Signed-off-by: David S. Miller --- net/ipv6/ip6_input.c | 2 +- net/ipv6/ip6mr.c | 23 ++++++++++++++++++----- net/ipv6/route.c | 2 +- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c index 936f48946e20..f171e8dbac91 100644 --- a/net/ipv6/ip6_input.c +++ b/net/ipv6/ip6_input.c @@ -255,6 +255,7 @@ int ip6_mc_input(struct sk_buff *skb) * IPv6 multicast router mode is now supported ;) */ if (dev_net(skb->dev)->ipv6.devconf_all->mc_forwarding && + !(ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) && likely(!(IP6CB(skb)->flags & IP6SKB_FORWARDED))) { /* * Okay, we try to forward - split and duplicate @@ -316,7 +317,6 @@ int ip6_mc_input(struct sk_buff *skb) } if (skb2) { - skb2->dev = skb2->dst->dev; ip6_mr_input(skb2); } } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 3c51b2d827f4..d19a84b79503 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -365,7 +365,9 @@ static int pim6_rcv(struct sk_buff *skb) pim = (struct pimreghdr *)skb_transport_header(skb); if (pim->type != ((PIM_VERSION << 4) | PIM_REGISTER) || (pim->flags & PIM_NULL_REGISTER) || - (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && + (csum_ipv6_magic(&ipv6_hdr(skb)->saddr, &ipv6_hdr(skb)->daddr, + sizeof(*pim), IPPROTO_PIM, + csum_partial((void *)pim, sizeof(*pim), 0)) && csum_fold(skb_checksum(skb, 0, skb->len, 0)))) goto drop; @@ -392,7 +394,7 @@ static int pim6_rcv(struct sk_buff *skb) skb_pull(skb, (u8 *)encap - skb->data); skb_reset_network_header(skb); skb->dev = reg_dev; - skb->protocol = htons(ETH_P_IP); + skb->protocol = htons(ETH_P_IPV6); skb->ip_summed = 0; skb->pkt_type = PACKET_HOST; dst_release(skb->dst); @@ -481,6 +483,7 @@ static int mif6_delete(struct net *net, int vifi) { struct mif_device *v; struct net_device *dev; + struct inet6_dev *in6_dev; if (vifi < 0 || vifi >= net->ipv6.maxvif) return -EADDRNOTAVAIL; @@ -513,6 +516,10 @@ static int mif6_delete(struct net *net, int vifi) dev_set_allmulti(dev, -1); + in6_dev = __in6_dev_get(dev); + if (in6_dev) + in6_dev->cnf.mc_forwarding--; + if (v->flags & MIFF_REGISTER) unregister_netdevice(dev); @@ -622,6 +629,7 @@ static int mif6_add(struct net *net, struct mif6ctl *vifc, int mrtsock) int vifi = vifc->mif6c_mifi; struct mif_device *v = &net->ipv6.vif6_table[vifi]; struct net_device *dev; + struct inet6_dev *in6_dev; int err; /* Is vif busy ? */ @@ -662,6 +670,10 @@ static int mif6_add(struct net *net, struct mif6ctl *vifc, int mrtsock) return -EINVAL; } + in6_dev = __in6_dev_get(dev); + if (in6_dev) + in6_dev->cnf.mc_forwarding++; + /* * Fill in the VIF structures */ @@ -838,8 +850,6 @@ static int ip6mr_cache_report(struct net *net, struct sk_buff *pkt, mifi_t mifi, skb->dst = dst_clone(pkt->dst); skb->ip_summed = CHECKSUM_UNNECESSARY; - - skb_pull(skb, sizeof(struct ipv6hdr)); } if (net->ipv6.mroute6_sk == NULL) { @@ -1222,8 +1232,10 @@ static int ip6mr_sk_init(struct sock *sk) rtnl_lock(); write_lock_bh(&mrt_lock); - if (likely(net->ipv6.mroute6_sk == NULL)) + if (likely(net->ipv6.mroute6_sk == NULL)) { net->ipv6.mroute6_sk = sk; + net->ipv6.devconf_all->mc_forwarding++; + } else err = -EADDRINUSE; write_unlock_bh(&mrt_lock); @@ -1242,6 +1254,7 @@ int ip6mr_sk_done(struct sock *sk) if (sk == net->ipv6.mroute6_sk) { write_lock_bh(&mrt_lock); net->ipv6.mroute6_sk = NULL; + net->ipv6.devconf_all->mc_forwarding--; write_unlock_bh(&mrt_lock); mroute_clean_tables(net); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index c4a59824ac2c..9c574235c905 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -794,7 +794,7 @@ void ip6_route_input(struct sk_buff *skb) .proto = iph->nexthdr, }; - if (rt6_need_strict(&iph->daddr)) + if (rt6_need_strict(&iph->daddr) && skb->dev->type != ARPHRD_PIMREG) flags |= RT6_LOOKUP_F_IFACE; skb->dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input); From a4e6db07984529847c6ad8bc616485e721dcb809 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 27 Jan 2009 22:41:03 -0800 Subject: [PATCH 17/60] ipv6: Make mc_forwarding sysctl read-only. The kernel manages this value internally, as necessary, as VIFs are added/removed and as multicast routers are registered and deregistered. Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e92ad8455c63..f9afb452249c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -4250,7 +4250,7 @@ static struct addrconf_sysctl_table .procname = "mc_forwarding", .data = &ipv6_devconf.mc_forwarding, .maxlen = sizeof(int), - .mode = 0644, + .mode = 0444, .proc_handler = proc_dointvec, }, #endif From e6a271651e9e7810c1802bb8375967f6efa4baea Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 26 Jan 2009 19:35:28 +0100 Subject: [PATCH 18/60] mac80211: remove Michael Wu as maintainer His email address keeps bouncing, and he's not interested in mac80211 patches etc. anyway. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- MAINTAINERS | 2 -- 1 file changed, 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index d992d407197b..474ec0c53272 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2836,8 +2836,6 @@ S: Maintained MAC80211 P: Johannes Berg M: johannes@sipsolutions.net -P: Michael Wu -M: flamingice@sourmilk.net L: linux-wireless@vger.kernel.org W: http://linuxwireless.org/ T: git kernel.org:/pub/scm/linux/kernel/git/linville/wireless-2.6.git From eb83bbf57429ab80f49b413e3e44d3b19c3fdc5a Mon Sep 17 00:00:00 2001 From: Larry Finger Date: Tue, 27 Jan 2009 12:31:23 -0600 Subject: [PATCH 19/60] rtl8187: Fix error in setting OFDM power settings for RTL8187L MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After reports of poor performance, a review of the latest vendor driver (rtl8187_linux_26.1025.0328.2007) for RTL8187L devices was undertaken. A difference was found in the code used to index the OFDM power tables. When the Linux driver was changed, my unit works at a much greater range than before. I think this fixes Bugzilla #12380 and has been tested by at least two other users. Signed-off-by: Larry Finger Tested-by: Martín Ernesto Barreyro Cc: Stable Signed-off-by: John W. Linville --- drivers/net/wireless/rtl818x/rtl8187_rtl8225.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/rtl818x/rtl8187_rtl8225.c b/drivers/net/wireless/rtl818x/rtl8187_rtl8225.c index 4e75e8e7fa90..78df281b297a 100644 --- a/drivers/net/wireless/rtl818x/rtl8187_rtl8225.c +++ b/drivers/net/wireless/rtl818x/rtl8187_rtl8225.c @@ -285,7 +285,10 @@ static void rtl8225_rf_set_tx_power(struct ieee80211_hw *dev, int channel) ofdm_power = priv->channels[channel - 1].hw_value >> 4; cck_power = min(cck_power, (u8)11); - ofdm_power = min(ofdm_power, (u8)35); + if (ofdm_power > (u8)15) + ofdm_power = 25; + else + ofdm_power += 10; rtl818x_iowrite8(priv, &priv->map->TX_GAIN_CCK, rtl8225_tx_gain_cck_ofdm[cck_power / 6] >> 1); @@ -536,7 +539,10 @@ static void rtl8225z2_rf_set_tx_power(struct ieee80211_hw *dev, int channel) cck_power += priv->txpwr_base & 0xF; cck_power = min(cck_power, (u8)35); - ofdm_power = min(ofdm_power, (u8)15); + if (ofdm_power > (u8)15) + ofdm_power = 25; + else + ofdm_power += 10; ofdm_power += priv->txpwr_base >> 4; ofdm_power = min(ofdm_power, (u8)35); From 1f304e4e3bb161163d9f5bc3c6467a2a6fa9b3ae Mon Sep 17 00:00:00 2001 From: "Zhu, Yi" Date: Fri, 23 Jan 2009 13:45:22 -0800 Subject: [PATCH 20/60] iwlwifi: fix kernel oops when ucode DMA memory allocation failure The patch fixes memcpy to NULL address when the ucode DMA allocation failure. This is a fix to bug http://www.intellinuxwireless.org/bugzilla/show_bug.cgi?id=1861 Signed-off-by: Zhu Yi Signed-off-by: Reinette Chatre Signed-off-by: John W. Linville --- drivers/net/wireless/iwlwifi/iwl-agn.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c index 0dc8eed16404..b35c8813bef4 100644 --- a/drivers/net/wireless/iwlwifi/iwl-agn.c +++ b/drivers/net/wireless/iwlwifi/iwl-agn.c @@ -1719,6 +1719,10 @@ static int iwl_read_ucode(struct iwl_priv *priv) priv->ucode_data_backup.len = data_size; iwl_alloc_fw_desc(priv->pci_dev, &priv->ucode_data_backup); + if (!priv->ucode_code.v_addr || !priv->ucode_data.v_addr || + !priv->ucode_data_backup.v_addr) + goto err_pci_alloc; + /* Initialization instructions and data */ if (init_size && init_data_size) { priv->ucode_init.len = init_size; From 615aab4b75dfa77b00c372330d6f70edd2458bf9 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Thu, 22 Jan 2009 15:05:46 -0800 Subject: [PATCH 21/60] cfg80211: Fix sanity check on 5 GHz when processing country IE This fixes two issues with the sanity check loop when processing the country IE: 1. Do not use frequency for the current subband channel check, this was a big fat typo. 2. Apply the 5 GHz 4-channel steps when considering max channel on each subband as was done with a recent patch. Signed-off-by: Luis R. Rodriguez Signed-off-by: John W. Linville --- net/wireless/reg.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index bc494cef2102..61698095e1ad 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -498,6 +498,7 @@ static struct ieee80211_regdomain *country_ie_2_rd( * calculate the number of reg rules we will need. We will need one * for each channel subband */ while (country_ie_len >= 3) { + int end_channel = 0; struct ieee80211_country_ie_triplet *triplet = (struct ieee80211_country_ie_triplet *) country_ie; int cur_sub_max_channel = 0, cur_channel = 0; @@ -509,9 +510,25 @@ static struct ieee80211_regdomain *country_ie_2_rd( continue; } + /* 2 GHz */ + if (triplet->chans.first_channel <= 14) + end_channel = triplet->chans.first_channel + + triplet->chans.num_channels; + else + /* + * 5 GHz -- For example in country IEs if the first + * channel given is 36 and the number of channels is 4 + * then the individual channel numbers defined for the + * 5 GHz PHY by these parameters are: 36, 40, 44, and 48 + * and not 36, 37, 38, 39. + * + * See: http://tinyurl.com/11d-clarification + */ + end_channel = triplet->chans.first_channel + + (4 * (triplet->chans.num_channels - 1)); + cur_channel = triplet->chans.first_channel; - cur_sub_max_channel = ieee80211_channel_to_frequency( - cur_channel + triplet->chans.num_channels); + cur_sub_max_channel = end_channel; /* Basic sanity check */ if (cur_sub_max_channel < cur_channel) @@ -590,15 +607,6 @@ static struct ieee80211_regdomain *country_ie_2_rd( end_channel = triplet->chans.first_channel + triplet->chans.num_channels; else - /* - * 5 GHz -- For example in country IEs if the first - * channel given is 36 and the number of channels is 4 - * then the individual channel numbers defined for the - * 5 GHz PHY by these parameters are: 36, 40, 44, and 48 - * and not 36, 37, 38, 39. - * - * See: http://tinyurl.com/11d-clarification - */ end_channel = triplet->chans.first_channel + (4 * (triplet->chans.num_channels - 1)); From 667ecd010d870f861a9e276aaaca8cb443ded8b3 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Thu, 22 Jan 2009 15:05:43 -0800 Subject: [PATCH 22/60] cfg80211: print correct intersected regulatory domain When CONFIG_CFG80211_REG_DEBUG is enabled and an intersection occurs we are printing the regulatory domain passed by CRDA and indicating its the intersected regulatory domain. Lets fix this and print the intersection as originally intended. Signed-off-by: Luis R. Rodriguez Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/reg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index 61698095e1ad..85c9034c59b2 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -1284,7 +1284,7 @@ static void reg_country_ie_process_debug( if (intersected_rd) { printk(KERN_DEBUG "cfg80211: We intersect both of these " "and get:\n"); - print_regdomain_info(rd); + print_regdomain_info(intersected_rd); return; } printk(KERN_DEBUG "cfg80211: Intersection between both failed\n"); From be0093705c3ce98d73cac0ca8f93ea105cdf4e9c Mon Sep 17 00:00:00 2001 From: Bob Copeland Date: Thu, 22 Jan 2009 08:44:16 -0500 Subject: [PATCH 23/60] ath5k: fix locking in ath5k_config ath5k_config updates the software context without taking sc->lock. Changes-licensed-under: 3-Clause-BSD Signed-off-by: Bob Copeland Acked-by: Nick Kossifidis Signed-off-by: John W. Linville --- drivers/net/wireless/ath5k/base.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath5k/base.c b/drivers/net/wireless/ath5k/base.c index 8ef87356e083..a533ed60bb4d 100644 --- a/drivers/net/wireless/ath5k/base.c +++ b/drivers/net/wireless/ath5k/base.c @@ -1028,6 +1028,8 @@ ath5k_setup_bands(struct ieee80211_hw *hw) * it's done by reseting the chip. To accomplish this we must * first cleanup any pending DMA, then restart stuff after a la * ath5k_init. + * + * Called with sc->lock. */ static int ath5k_chan_set(struct ath5k_softc *sc, struct ieee80211_channel *chan) @@ -2814,11 +2816,17 @@ ath5k_config(struct ieee80211_hw *hw, u32 changed) { struct ath5k_softc *sc = hw->priv; struct ieee80211_conf *conf = &hw->conf; + int ret; + + mutex_lock(&sc->lock); sc->bintval = conf->beacon_int; sc->power_level = conf->power_level; - return ath5k_chan_set(sc, conf->channel); + ret = ath5k_chan_set(sc, conf->channel); + + mutex_unlock(&sc->lock); + return ret; } static int From e125646ab56b490d0390b158e0afa9cccfc1f897 Mon Sep 17 00:00:00 2001 From: Dhananjay Phadke Date: Thu, 29 Jan 2009 16:05:19 -0800 Subject: [PATCH 24/60] netxen: revert jumbo ringsize Reducing jumbo ring size below 1024 reduces throughput for old firmwares (3.4.216 and older) running on older (NX2031) chip, so restore it back to 1024. This was reduced in commit 32ec803348b4d5f1353e1d7feae30880b8b3e342 ("netxen: reduce memory footprint"). Raising jumbo ring size from 512 to 1024, adds ~4MB per port, but there's still big saving because of original patch (~20MB per port). Signed-off-by: Dhananjay Phadke Signed-off-by: David S. Miller --- drivers/net/netxen/netxen_nic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/netxen/netxen_nic.h b/drivers/net/netxen/netxen_nic.h index a75a31005fd3..9c78c963b721 100644 --- a/drivers/net/netxen/netxen_nic.h +++ b/drivers/net/netxen/netxen_nic.h @@ -210,7 +210,7 @@ #define MAX_CMD_DESCRIPTORS_HOST 1024 #define MAX_RCV_DESCRIPTORS_1G 2048 #define MAX_RCV_DESCRIPTORS_10G 4096 -#define MAX_JUMBO_RCV_DESCRIPTORS 512 +#define MAX_JUMBO_RCV_DESCRIPTORS 1024 #define MAX_LRO_RCV_DESCRIPTORS 8 #define MAX_RCVSTATUS_DESCRIPTORS MAX_RCV_DESCRIPTORS #define MAX_JUMBO_RCV_DESC MAX_JUMBO_RCV_DESCRIPTORS From 95e3b24cfb4ec0479d2c42f7a1780d68063a542a Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 29 Jan 2009 16:07:52 -0800 Subject: [PATCH 25/60] net: Fix frag_list handling in skb_seq_read The frag_list handling was broken in skb_seq_read: 1) We didn't add the stepped offset when looking at the head are of fragments other than the first. 2) We didn't take the stepped offset away when setting the data pointer in the head area. 3) The frag index wasn't reset. This patch fixes both issues. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/skbuff.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 2e5f2ca3bdcd..f23fd43539ed 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2212,10 +2212,10 @@ unsigned int skb_seq_read(unsigned int consumed, const u8 **data, return 0; next_skb: - block_limit = skb_headlen(st->cur_skb); + block_limit = skb_headlen(st->cur_skb) + st->stepped_offset; if (abs_offset < block_limit) { - *data = st->cur_skb->data + abs_offset; + *data = st->cur_skb->data + (abs_offset - st->stepped_offset); return block_limit - abs_offset; } @@ -2257,6 +2257,7 @@ next_skb: } else if (st->root_skb == st->cur_skb && skb_shinfo(st->root_skb)->frag_list) { st->cur_skb = skb_shinfo(st->root_skb)->frag_list; + st->frag_idx = 0; goto next_skb; } From 71b3346d182355f19509fadb8fe45114a35cc499 Mon Sep 17 00:00:00 2001 From: Shyam Iyer Date: Thu, 29 Jan 2009 16:12:42 -0800 Subject: [PATCH 26/60] net: Fix OOPS in skb_seq_read(). It oopsd for me in skb_seq_read. addr2line said it was linux-2.6/net/core/skbuff.c:2228, which is this line: while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) { I added some printks in there and it looks like we hit this: } else if (st->root_skb == st->cur_skb && skb_shinfo(st->root_skb)->frag_list) { st->cur_skb = skb_shinfo(st->root_skb)->frag_list; st->frag_idx = 0; goto next_skb; } Actually I did some testing and added a few printks and found that the st->cur_skb->data was 0 and hence the ptr used by iscsi_tcp was null. This caused the kernel panic. if (abs_offset < block_limit) { - *data = st->cur_skb->data + abs_offset; + *data = st->cur_skb->data + (abs_offset - st->stepped_offset); I enabled the debug_tcp and with a few printks found that the code did not go to the next_skb label and could find that the sequence being followed was this - It hit this if condition - if (st->cur_skb->next) { st->cur_skb = st->cur_skb->next; st->frag_idx = 0; goto next_skb; And so, now the st pointer is shifted to the next skb whereas actually it should have hit the second else if first since the data is in the frag_list. else if (st->root_skb == st->cur_skb && skb_shinfo(st->root_skb)->frag_list) { st->cur_skb = skb_shinfo(st->root_skb)->frag_list; goto next_skb; } Reversing the two conditions the attached patch fixes the issue for me on top of Herbert's patches. Signed-off-by: Shyam Iyer Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- net/core/skbuff.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index f23fd43539ed..da74b844f4ea 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2250,13 +2250,13 @@ next_skb: st->frag_data = NULL; } - if (st->cur_skb->next) { - st->cur_skb = st->cur_skb->next; + if (st->root_skb == st->cur_skb && + skb_shinfo(st->root_skb)->frag_list) { + st->cur_skb = skb_shinfo(st->root_skb)->frag_list; st->frag_idx = 0; goto next_skb; - } else if (st->root_skb == st->cur_skb && - skb_shinfo(st->root_skb)->frag_list) { - st->cur_skb = skb_shinfo(st->root_skb)->frag_list; + } else if (st->cur_skb->next) { + st->cur_skb = st->cur_skb->next; st->frag_idx = 0; goto next_skb; } From 58092d1e0a896eb1d9163d58f93df7ed704fa8e1 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Thu, 29 Jan 2009 16:16:31 -0800 Subject: [PATCH 27/60] net: update documentation ip aliases This documentation is old. Add a short note to describe why aliases are no long necessary, and remove the old contact/edit info. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- Documentation/networking/alias.txt | 25 ++++++------------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/Documentation/networking/alias.txt b/Documentation/networking/alias.txt index cd12c2ff518a..85046f53fcfc 100644 --- a/Documentation/networking/alias.txt +++ b/Documentation/networking/alias.txt @@ -2,14 +2,14 @@ IP-Aliasing: ============ -IP-aliases are additional IP-addresses/masks hooked up to a base -interface by adding a colon and a string when running ifconfig. +IP-aliases are an obsolete way to manage multiple IP-addresses/masks +per interface. Newer tools such as iproute2 support multiple +address/prefixes per interface, but aliases are still supported +for backwards compatibility. + +An alias is formed by adding a colon and a string when running ifconfig. This string is usually numeric, but this is not a must. -IP-Aliases are avail if CONFIG_INET (`standard' IPv4 networking) -is configured in the kernel. - - o Alias creation. Alias creation is done by 'magic' interface naming: eg. to create a 200.1.1.1 alias for eth0 ... @@ -38,16 +38,3 @@ o Relationship with main device If the base device is shut down the added aliases will be deleted too. - - -Contact -------- -Please finger or e-mail me: - Juan Jose Ciarlante - -Updated by Erik Schoenfelder - -; local variables: -; mode: indented-text -; mode: auto-fill -; end: From 9d8dba6c979fa99c96938c869611b9a23b73efa9 Mon Sep 17 00:00:00 2001 From: Benjamin Zores Date: Thu, 29 Jan 2009 16:19:13 -0800 Subject: [PATCH 28/60] ipv4: fix infinite retry loop in IP-Config Signed-off-by: Benjamin Zores Signed-off-by: David S. Miller --- net/ipv4/ipconfig.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index 42a0f3dd3fd6..d722013c1cae 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -1268,6 +1268,9 @@ __be32 __init root_nfs_parse_addr(char *name) static int __init ip_auto_config(void) { __be32 addr; +#ifdef IPCONFIG_DYNAMIC + int retries = CONF_OPEN_RETRIES; +#endif #ifdef CONFIG_PROC_FS proc_net_fops_create(&init_net, "pnp", S_IRUGO, &pnp_seq_fops); @@ -1304,9 +1307,6 @@ static int __init ip_auto_config(void) #endif ic_first_dev->next) { #ifdef IPCONFIG_DYNAMIC - - int retries = CONF_OPEN_RETRIES; - if (ic_dynamic() < 0) { ic_close_devs(); From df1c46b2b6876d0a1b1b4740f009fa69d95ebbc9 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 29 Jan 2009 16:53:35 -0800 Subject: [PATCH 29/60] tun: Add some missing TUN compat ioctl translations. Based upon a report from Michael Tokarev : Just saw in dmesg: ioctl32(kvm:4408): Unknown cmd fd(9) cmd(800454cf){t:'T';sz:4} arg(ffc668e4) on /dev/net/tun Signed-off-by: David S. Miller --- fs/compat_ioctl.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 5235c67e7594..c8f8d5904f5e 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -538,6 +538,7 @@ static int dev_ifsioc(unsigned int fd, unsigned int cmd, unsigned long arg) * cannot be fixed without breaking all existing apps. */ case TUNSETIFF: + case TUNGETIFF: case SIOCGIFFLAGS: case SIOCGIFMETRIC: case SIOCGIFMTU: @@ -1982,6 +1983,11 @@ COMPATIBLE_IOCTL(TUNSETNOCSUM) COMPATIBLE_IOCTL(TUNSETDEBUG) COMPATIBLE_IOCTL(TUNSETPERSIST) COMPATIBLE_IOCTL(TUNSETOWNER) +COMPATIBLE_IOCTL(TUNSETLINK) +COMPATIBLE_IOCTL(TUNSETGROUP) +COMPATIBLE_IOCTL(TUNGETFEATURES) +COMPATIBLE_IOCTL(TUNSETOFFLOAD) +COMPATIBLE_IOCTL(TUNSETTXFILTER) /* Big V */ COMPATIBLE_IOCTL(VT_SETMODE) COMPATIBLE_IOCTL(VT_GETMODE) @@ -2573,6 +2579,7 @@ HANDLE_IOCTL(SIOCGIFPFLAGS, dev_ifsioc) HANDLE_IOCTL(SIOCGIFTXQLEN, dev_ifsioc) HANDLE_IOCTL(SIOCSIFTXQLEN, dev_ifsioc) HANDLE_IOCTL(TUNSETIFF, dev_ifsioc) +HANDLE_IOCTL(TUNGETIFF, dev_ifsioc) HANDLE_IOCTL(SIOCETHTOOL, ethtool_ioctl) HANDLE_IOCTL(SIOCBONDENSLAVE, bond_ioctl) HANDLE_IOCTL(SIOCBONDRELEASE, bond_ioctl) From 584dbe9475313e117abf9d2af88164edfd429c9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Marjam=C3=A4ki?= Date: Thu, 29 Jan 2009 08:55:56 +0000 Subject: [PATCH 30/60] netxen: fix memory leak in drivers/net/netxen_nic_init.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For kernel bugzilla #12537: http://bugzilla.kernel.org/show_bug.cgi?id=12537 Free memory. Signed-off-by: Daniel Marjamäki Signed-off-by: David S. Miller --- drivers/net/netxen/netxen_nic_init.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/netxen/netxen_nic_init.c b/drivers/net/netxen/netxen_nic_init.c index ca7c8d8050c9..ffd37bea1628 100644 --- a/drivers/net/netxen/netxen_nic_init.c +++ b/drivers/net/netxen/netxen_nic_init.c @@ -947,8 +947,10 @@ int netxen_pinit_from_rom(struct netxen_adapter *adapter, int verbose) } for (i = 0; i < n; i++) { if (netxen_rom_fast_read(adapter, 8*i + 4*offset, &val) != 0 || - netxen_rom_fast_read(adapter, 8*i + 4*offset + 4, &addr) != 0) + netxen_rom_fast_read(adapter, 8*i + 4*offset + 4, &addr) != 0) { + kfree(buf); return -EIO; + } buf[i].addr = addr; buf[i].data = val; From 1af7ad51049d6a310a19d497960597198290ddfa Mon Sep 17 00:00:00 2001 From: Inaky Perez-Gonzalez Date: Thu, 29 Jan 2009 17:18:31 -0800 Subject: [PATCH 31/60] wimax: fix build issue when debugfs is disabled MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As reported by Toralf Förster and Randy Dunlap. - http://linuxwimax.org/pipermail/wimax/2009-January/000460.html - http://lkml.org/lkml/2009/1/29/279 The definitions needed for the wimax stack and i2400m driver debug infrastructure was, by mistake, compiled depending on CONFIG_DEBUG_FS (by them being placed in the debugfs.c files); thus the build broke in 2.6.29-rc3 when debugging was enabled (CONFIG_WIMAX_DEBUG) and DEBUG_FS was disabled. These definitions are always needed if debug is enabled at compile time (independently of DEBUG_FS being or not enabled), so moving them to a file that is always compiled fixes the issue. Signed-off-by: Inaky Perez-Gonzalez Signed-off-by: David S. Miller --- drivers/net/wimax/i2400m/debugfs.c | 14 -------------- drivers/net/wimax/i2400m/driver.c | 16 ++++++++++++++++ net/wimax/debugfs.c | 11 ----------- net/wimax/stack.c | 13 +++++++++++++ 4 files changed, 29 insertions(+), 25 deletions(-) diff --git a/drivers/net/wimax/i2400m/debugfs.c b/drivers/net/wimax/i2400m/debugfs.c index 626632985977..9b81af3f80a9 100644 --- a/drivers/net/wimax/i2400m/debugfs.c +++ b/drivers/net/wimax/i2400m/debugfs.c @@ -234,20 +234,6 @@ struct dentry *debugfs_create_i2400m_reset( &fops_i2400m_reset); } -/* - * Debug levels control; see debug.h - */ -struct d_level D_LEVEL[] = { - D_SUBMODULE_DEFINE(control), - D_SUBMODULE_DEFINE(driver), - D_SUBMODULE_DEFINE(debugfs), - D_SUBMODULE_DEFINE(fw), - D_SUBMODULE_DEFINE(netdev), - D_SUBMODULE_DEFINE(rfkill), - D_SUBMODULE_DEFINE(rx), - D_SUBMODULE_DEFINE(tx), -}; -size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); #define __debugfs_register(prefix, name, parent) \ do { \ diff --git a/drivers/net/wimax/i2400m/driver.c b/drivers/net/wimax/i2400m/driver.c index 5f98047e18cf..e80a0b65a754 100644 --- a/drivers/net/wimax/i2400m/driver.c +++ b/drivers/net/wimax/i2400m/driver.c @@ -707,6 +707,22 @@ void i2400m_release(struct i2400m *i2400m) EXPORT_SYMBOL_GPL(i2400m_release); +/* + * Debug levels control; see debug.h + */ +struct d_level D_LEVEL[] = { + D_SUBMODULE_DEFINE(control), + D_SUBMODULE_DEFINE(driver), + D_SUBMODULE_DEFINE(debugfs), + D_SUBMODULE_DEFINE(fw), + D_SUBMODULE_DEFINE(netdev), + D_SUBMODULE_DEFINE(rfkill), + D_SUBMODULE_DEFINE(rx), + D_SUBMODULE_DEFINE(tx), +}; +size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); + + static int __init i2400m_driver_init(void) { diff --git a/net/wimax/debugfs.c b/net/wimax/debugfs.c index 87cf4430079c..94d216a46407 100644 --- a/net/wimax/debugfs.c +++ b/net/wimax/debugfs.c @@ -28,17 +28,6 @@ #include "debug-levels.h" -/* Debug framework control of debug levels */ -struct d_level D_LEVEL[] = { - D_SUBMODULE_DEFINE(debugfs), - D_SUBMODULE_DEFINE(id_table), - D_SUBMODULE_DEFINE(op_msg), - D_SUBMODULE_DEFINE(op_reset), - D_SUBMODULE_DEFINE(op_rfkill), - D_SUBMODULE_DEFINE(stack), -}; -size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); - #define __debugfs_register(prefix, name, parent) \ do { \ result = d_level_register_debugfs(prefix, name, parent); \ diff --git a/net/wimax/stack.c b/net/wimax/stack.c index d4da92f8981a..3869c0327882 100644 --- a/net/wimax/stack.c +++ b/net/wimax/stack.c @@ -516,6 +516,19 @@ void wimax_dev_rm(struct wimax_dev *wimax_dev) } EXPORT_SYMBOL_GPL(wimax_dev_rm); + +/* Debug framework control of debug levels */ +struct d_level D_LEVEL[] = { + D_SUBMODULE_DEFINE(debugfs), + D_SUBMODULE_DEFINE(id_table), + D_SUBMODULE_DEFINE(op_msg), + D_SUBMODULE_DEFINE(op_reset), + D_SUBMODULE_DEFINE(op_rfkill), + D_SUBMODULE_DEFINE(stack), +}; +size_t D_LEVEL_SIZE = ARRAY_SIZE(D_LEVEL); + + struct genl_family wimax_gnl_family = { .id = GENL_ID_GENERATE, .name = "WiMAX", From b1c4a9dddf09fe99b8f88252718ac5b357363dc4 Mon Sep 17 00:00:00 2001 From: Haiying Wang Date: Thu, 29 Jan 2009 17:28:04 -0800 Subject: [PATCH 32/60] ucc_geth: Change uec phy id to the same format as gianfar's The commit b31a1d8b41513b96e9c7ec2f68c5734cef0b26a4 ("gianfar: Convert gianfar to an of_platform_driver") changes the gianfar's phy id to the format like "mdio@xxxx:xx", but uec still uses the old format like "xxxxxxxx:xx". For the board whose UEC uses gianfar-mdio like MPC8568MDS, the phy can not be attached because of the incompatible phy id format. This patch changes uec's phy id to the same format as gianfar's. Signed-off-by: Haiying Wang Signed-off-by: David S. Miller --- drivers/net/ucc_geth.c | 20 ++++++++++++++++++-- drivers/net/ucc_geth.h | 2 ++ drivers/net/ucc_geth_mii.c | 12 +++++++++++- drivers/net/ucc_geth_mii.h | 1 + 4 files changed, 32 insertions(+), 3 deletions(-) diff --git a/drivers/net/ucc_geth.c b/drivers/net/ucc_geth.c index 11441225bf41..e87986867ba5 100644 --- a/drivers/net/ucc_geth.c +++ b/drivers/net/ucc_geth.c @@ -1536,6 +1536,11 @@ static void adjust_link(struct net_device *dev) static int init_phy(struct net_device *dev) { struct ucc_geth_private *priv = netdev_priv(dev); + struct device_node *np = priv->node; + struct device_node *phy, *mdio; + const phandle *ph; + char bus_name[MII_BUS_ID_SIZE]; + const unsigned int *id; struct phy_device *phydev; char phy_id[BUS_ID_SIZE]; @@ -1543,8 +1548,18 @@ static int init_phy(struct net_device *dev) priv->oldspeed = 0; priv->oldduplex = -1; - snprintf(phy_id, sizeof(phy_id), PHY_ID_FMT, priv->ug_info->mdio_bus, - priv->ug_info->phy_address); + ph = of_get_property(np, "phy-handle", NULL); + phy = of_find_node_by_phandle(*ph); + mdio = of_get_parent(phy); + + id = of_get_property(phy, "reg", NULL); + + of_node_put(phy); + of_node_put(mdio); + + uec_mdio_bus_name(bus_name, mdio); + snprintf(phy_id, sizeof(phy_id), "%s:%02x", + bus_name, *id); phydev = phy_connect(dev, phy_id, &adjust_link, 0, priv->phy_interface); @@ -3748,6 +3763,7 @@ static int ucc_geth_probe(struct of_device* ofdev, const struct of_device_id *ma ugeth->ug_info = ug_info; ugeth->dev = dev; + ugeth->node = np; return 0; } diff --git a/drivers/net/ucc_geth.h b/drivers/net/ucc_geth.h index 8f699cb773ee..16cbe42ba43c 100644 --- a/drivers/net/ucc_geth.h +++ b/drivers/net/ucc_geth.h @@ -1186,6 +1186,8 @@ struct ucc_geth_private { int oldspeed; int oldduplex; int oldlink; + + struct device_node *node; }; void uec_set_ethtool_ops(struct net_device *netdev); diff --git a/drivers/net/ucc_geth_mii.c b/drivers/net/ucc_geth_mii.c index c001d261366b..54635911305c 100644 --- a/drivers/net/ucc_geth_mii.c +++ b/drivers/net/ucc_geth_mii.c @@ -156,7 +156,7 @@ static int uec_mdio_probe(struct of_device *ofdev, const struct of_device_id *ma if (err) goto reg_map_fail; - snprintf(new_bus->id, MII_BUS_ID_SIZE, "%x", res.start); + uec_mdio_bus_name(new_bus->id, np); new_bus->irq = kmalloc(32 * sizeof(int), GFP_KERNEL); @@ -283,3 +283,13 @@ void uec_mdio_exit(void) { of_unregister_platform_driver(&uec_mdio_driver); } + +void uec_mdio_bus_name(char *name, struct device_node *np) +{ + const u32 *reg; + + reg = of_get_property(np, "reg", NULL); + + snprintf(name, MII_BUS_ID_SIZE, "%s@%x", np->name, reg ? *reg : 0); +} + diff --git a/drivers/net/ucc_geth_mii.h b/drivers/net/ucc_geth_mii.h index 1e45b2028a50..840cf80235b7 100644 --- a/drivers/net/ucc_geth_mii.h +++ b/drivers/net/ucc_geth_mii.h @@ -97,4 +97,5 @@ int uec_mdio_read(struct mii_bus *bus, int mii_id, int regnum); int uec_mdio_write(struct mii_bus *bus, int mii_id, int regnum, u16 value); int __init uec_mdio_init(void); void uec_mdio_exit(void); +void uec_mdio_bus_name(char *name, struct device_node *np); #endif /* __UEC_MII_H */ From 1609559547ae0ddc2e4829c7f78ac2c4869875b9 Mon Sep 17 00:00:00 2001 From: Steve Glendinning Date: Thu, 29 Jan 2009 17:29:15 -0800 Subject: [PATCH 33/60] smsc9420: fix interrupt signalling test failures smsc9420 performs an interrupt signalling test when the interface is brought up. The current code mistakenly sets its test flag to false AFTER enabling the software interrupt source, making failure quite likely. This patch changes the code to set the test flag BEFORE enabling interrupts. I've also removed an smp_wmb because the following spinlock provides an implicit memory barrier. Signed-off-by: Steve Glendinning Signed-off-by: David S. Miller --- drivers/net/smsc9420.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/smsc9420.c b/drivers/net/smsc9420.c index c14a4c6452c7..d801900a5036 100644 --- a/drivers/net/smsc9420.c +++ b/drivers/net/smsc9420.c @@ -1378,6 +1378,7 @@ static int smsc9420_open(struct net_device *dev) /* test the IRQ connection to the ISR */ smsc_dbg(IFUP, "Testing ISR using IRQ %d", dev->irq); + pd->software_irq_signal = false; spin_lock_irqsave(&pd->int_lock, flags); /* configure interrupt deassertion timer and enable interrupts */ @@ -1393,8 +1394,6 @@ static int smsc9420_open(struct net_device *dev) smsc9420_pci_flush_write(pd); timeout = 1000; - pd->software_irq_signal = false; - smp_wmb(); while (timeout--) { if (pd->software_irq_signal) break; From f307dbd88d82c4ccab7aec49613c366023b89cde Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 29 Jan 2009 17:30:00 -0800 Subject: [PATCH 34/60] smsc911x: timeout reaches -1 With a postfix decrement the timeout will reach -1 rather than 0, so the warning will not be issued. Signed-off-by: Roel Kluin Acked-by: Steve Glendinning Signed-off-by: David S. Miller --- drivers/net/smsc911x.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/smsc911x.c b/drivers/net/smsc911x.c index f513bdf1c887..783c1a7b869e 100644 --- a/drivers/net/smsc911x.c +++ b/drivers/net/smsc911x.c @@ -953,7 +953,7 @@ smsc911x_rx_fastforward(struct smsc911x_data *pdata, unsigned int pktbytes) do { udelay(1); val = smsc911x_reg_read(pdata, RX_DP_CTRL); - } while (timeout-- && (val & RX_DP_CTRL_RX_FFWD_)); + } while (--timeout && (val & RX_DP_CTRL_RX_FFWD_)); if (unlikely(timeout == 0)) SMSC_WARNING(HW, "Timed out waiting for " From e5664bb2a7fd8ae1bee1281c2e44653c471af9ca Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Thu, 29 Jan 2009 17:31:13 -0800 Subject: [PATCH 35/60] gianfar: Fix Wake-on-LAN support commit 0f0ca340e57bd7446855fefd07a64249acf81223 ("phy: power management support") caused a regression in the gianfar driver. Now phylib turns off PHY power during suspend, and thus WOL doesn't work anymore. This patch workarounds the issue by enabling wakeup in the MDIO device, i.e. just restores the old behaviour for the gianfar driver. Note that this way all PHYs on a given MDIO bus won't be turned off during suspend, which isn't good from the power saving point of view. A proper, per netdevice wakeup management support will need a bit reworked phylib suspend/resume logic. Signed-off-by: Anton Vorontsov Signed-off-by: David S. Miller --- drivers/net/gianfar_mii.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/gianfar_mii.c b/drivers/net/gianfar_mii.c index f3706e415b45..f49a426ad681 100644 --- a/drivers/net/gianfar_mii.c +++ b/drivers/net/gianfar_mii.c @@ -234,6 +234,8 @@ static int gfar_mdio_probe(struct of_device *ofdev, if (NULL == new_bus) return -ENOMEM; + device_init_wakeup(&ofdev->dev, 1); + new_bus->name = "Gianfar MII Bus", new_bus->read = &gfar_mdio_read, new_bus->write = &gfar_mdio_write, From c25b9abbc2c2c0da88e180c3933d6e773245815a Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 29 Jan 2009 17:32:20 -0800 Subject: [PATCH 36/60] drivers/net/skfp: if !capable(CAP_NET_ADMIN): inverted logic Fix inverted logic Signed-off-by: Roel Kluin Signed-off-by: David S. Miller --- drivers/net/skfp/skfddi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/skfp/skfddi.c b/drivers/net/skfp/skfddi.c index 607efeaf0bc5..9a00e5566af7 100644 --- a/drivers/net/skfp/skfddi.c +++ b/drivers/net/skfp/skfddi.c @@ -1003,9 +1003,9 @@ static int skfp_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) break; case SKFP_CLR_STATS: /* Zero out the driver statistics */ if (!capable(CAP_NET_ADMIN)) { - memset(&lp->MacStat, 0, sizeof(lp->MacStat)); - } else { status = -EPERM; + } else { + memset(&lp->MacStat, 0, sizeof(lp->MacStat)); } break; default: From f99ec0649accb581cf3e8fcfeea796e82d05f4ea Mon Sep 17 00:00:00 2001 From: Philippe De Muyter Date: Thu, 29 Jan 2009 17:35:04 -0800 Subject: [PATCH 37/60] tulip: fix 21142 with 10Mbps without negotiation with current kernels, tulip 21142 ethernet controllers fail to connect to a 10Mbps only (i.e. without negotiation-partner) network. It used to work in 2.4 kernels. Fix that. Tested on a 21142 Rev 0x11. Signed-off-by: Philippe De Muyter Signed-off-by: David S. Miller --- drivers/net/tulip/21142.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/net/tulip/21142.c b/drivers/net/tulip/21142.c index 1210fb3748a7..db7d5e11855d 100644 --- a/drivers/net/tulip/21142.c +++ b/drivers/net/tulip/21142.c @@ -9,6 +9,11 @@ Please refer to Documentation/DocBook/tulip-user.{pdf,ps,html} for more information on this driver. + + DC21143 manual "21143 PCI/CardBus 10/100Mb/s Ethernet LAN Controller + Hardware Reference Manual" is currently available at : + http://developer.intel.com/design/network/manuals/278074.htm + Please submit bugs to http://bugzilla.kernel.org/ . */ @@ -32,7 +37,11 @@ void t21142_media_task(struct work_struct *work) int csr12 = ioread32(ioaddr + CSR12); int next_tick = 60*HZ; int new_csr6 = 0; + int csr14 = ioread32(ioaddr + CSR14); + /* CSR12[LS10,LS100] are not reliable during autonegotiation */ + if ((csr14 & 0x80) && (csr12 & 0x7000) != 0x5000) + csr12 |= 6; if (tulip_debug > 2) printk(KERN_INFO"%s: 21143 negotiation status %8.8x, %s.\n", dev->name, csr12, medianame[dev->if_port]); @@ -76,7 +85,7 @@ void t21142_media_task(struct work_struct *work) new_csr6 = 0x83860000; dev->if_port = 3; iowrite32(0, ioaddr + CSR13); - iowrite32(0x0003FF7F, ioaddr + CSR14); + iowrite32(0x0003FFFF, ioaddr + CSR14); iowrite16(8, ioaddr + CSR15); iowrite32(1, ioaddr + CSR13); } @@ -132,10 +141,14 @@ void t21142_lnk_change(struct net_device *dev, int csr5) struct tulip_private *tp = netdev_priv(dev); void __iomem *ioaddr = tp->base_addr; int csr12 = ioread32(ioaddr + CSR12); + int csr14 = ioread32(ioaddr + CSR14); + /* CSR12[LS10,LS100] are not reliable during autonegotiation */ + if ((csr14 & 0x80) && (csr12 & 0x7000) != 0x5000) + csr12 |= 6; if (tulip_debug > 1) printk(KERN_INFO"%s: 21143 link status interrupt %8.8x, CSR5 %x, " - "%8.8x.\n", dev->name, csr12, csr5, ioread32(ioaddr + CSR14)); + "%8.8x.\n", dev->name, csr12, csr5, csr14); /* If NWay finished and we have a negotiated partner capability. */ if (tp->nway && !tp->nwayset && (csr12 & 0x7000) == 0x5000) { @@ -143,7 +156,9 @@ void t21142_lnk_change(struct net_device *dev, int csr5) int negotiated = tp->sym_advertise & (csr12 >> 16); tp->lpar = csr12 >> 16; tp->nwayset = 1; - if (negotiated & 0x0100) dev->if_port = 5; + /* If partner cannot negotiate, it is 10Mbps Half Duplex */ + if (!(csr12 & 0x8000)) dev->if_port = 0; + else if (negotiated & 0x0100) dev->if_port = 5; else if (negotiated & 0x0080) dev->if_port = 3; else if (negotiated & 0x0040) dev->if_port = 4; else if (negotiated & 0x0020) dev->if_port = 0; @@ -214,7 +229,7 @@ void t21142_lnk_change(struct net_device *dev, int csr5) tp->timer.expires = RUN_AT(3*HZ); add_timer(&tp->timer); } else if (dev->if_port == 5) - iowrite32(ioread32(ioaddr + CSR14) & ~0x080, ioaddr + CSR14); + iowrite32(csr14 & ~0x080, ioaddr + CSR14); } else if (dev->if_port == 0 || dev->if_port == 4) { if ((csr12 & 4) == 0) printk(KERN_INFO"%s: 21143 10baseT link beat good.\n", From b9ec63f78b425c0e16cc95605b5d4ff2dc228b97 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 30 Jan 2009 00:00:24 -0500 Subject: [PATCH 38/60] ext4: Remove bogus BUG() check in ext4_bmap() The code to support journal-less ext4 operation added a BUG to ext4_bmap() which fired if there was no journal and the EXT4_STATE_JDATA bit was set in the i_state field. This caused running the filefrag program (which uses the FIMBAP ioctl) to trigger a BUG(). The EXT4_STATE_JDATA bit is only used for ext4_bmap(), and it's harmless for the bit to be set. We could add a check in __ext4_journalled_writepage() and ext4_journalled_write_end() to only set the EXT4_STATE_JDATA bit if the journal is present, but that adds an extra test and jump instruction. It's easier to simply remove the BUG check. http://bugzilla.kernel.org/show_bug.cgi?id=12568 Signed-off-by: "Theodore Ts'o" Cc: stable@kernel.org --- fs/ext4/inode.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b4386dafeb0c..03ba20be1329 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2821,9 +2821,6 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) filemap_write_and_wait(mapping); } - BUG_ON(!EXT4_JOURNAL(inode) && - EXT4_I(inode)->i_state & EXT4_STATE_JDATA); - if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { /* * This is a REALLY heavyweight approach, but the use of From 7b24fc4d7eb611da367dea3aad45473050aacd6c Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Sun, 4 Jan 2009 02:43:38 -0500 Subject: [PATCH 39/60] block: Don't verify integrity metadata on read error If we get an I/O error on a read request there is no point in doing a verify pass on the integrity buffer. Adjust the completion path accordingly. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 25 +++++++++++++++---------- include/linux/bio.h | 1 - 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 77ebc3c263d6..8396d741f804 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -465,7 +465,7 @@ static int bio_integrity_verify(struct bio *bio) if (ret) { kunmap_atomic(kaddr, KM_USER0); - break; + return ret; } sectors = bv->bv_len / bi->sector_size; @@ -493,18 +493,13 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; - int error = bip->bip_error; + int error; - if (bio_integrity_verify(bio)) { - clear_bit(BIO_UPTODATE, &bio->bi_flags); - error = -EIO; - } + error = bio_integrity_verify(bio); /* Restore original bio completion handler */ bio->bi_end_io = bip->bip_end_io; - - if (bio->bi_end_io) - bio->bi_end_io(bio, error); + bio_endio(bio, error); } /** @@ -525,7 +520,17 @@ void bio_integrity_endio(struct bio *bio, int error) BUG_ON(bip->bip_bio != bio); - bip->bip_error = error; + /* In case of an I/O error there is no point in verifying the + * integrity metadata. Restore original bio end_io handler + * and run it. + */ + if (error) { + bio->bi_end_io = bip->bip_end_io; + bio_endio(bio, error); + + return; + } + INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); queue_work(kintegrityd_wq, &bip->bip_work); } diff --git a/include/linux/bio.h b/include/linux/bio.h index 18462c5b8fff..18fc4a281a7b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -312,7 +312,6 @@ struct bio_integrity_payload { void *bip_buf; /* generated integrity data */ bio_end_io_t *bip_end_io; /* saved I/O completion fn */ - int bip_error; /* saved I/O error */ unsigned int bip_size; unsigned short bip_pool; /* pool the ivec came from */ From 8ae372e3bb4acaca37ffa2ce54f4cf8dd60a94fa Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Sun, 4 Jan 2009 02:43:39 -0500 Subject: [PATCH 40/60] block: Remove obsolete BUG_ON Now that bio_vecs are no longer cleared in bvec_alloc_bs() the following BUG_ON must go. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 8396d741f804..549b0144da11 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -140,7 +140,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, iv = bip_vec_idx(bip, bip->bip_vcnt); BUG_ON(iv == NULL); - BUG_ON(iv->bv_page != NULL); iv->bv_page = page; iv->bv_len = len; From 322316385dde5cd879e682bcb598c56d0659fb60 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Sun, 4 Jan 2009 02:43:40 -0500 Subject: [PATCH 41/60] block: Allow empty integrity profile Allow a block device to allocate and register an integrity profile without providing a template. This allows DM to preallocate a profile to avoid deadlocks during table reconfiguration. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-integrity.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 61a8e2f8fdd0..91fa8e06b6a5 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -309,24 +309,24 @@ static struct kobj_type integrity_ktype = { /** * blk_integrity_register - Register a gendisk as being integrity-capable * @disk: struct gendisk pointer to make integrity-aware - * @template: integrity profile + * @template: optional integrity profile to register * * Description: When a device needs to advertise itself as being able * to send/receive integrity metadata it must use this function to * register the capability with the block layer. The template is a * blk_integrity struct with values appropriate for the underlying - * hardware. See Documentation/block/data-integrity.txt. + * hardware. If template is NULL the new profile is allocated but + * not filled out. See Documentation/block/data-integrity.txt. */ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) { struct blk_integrity *bi; BUG_ON(disk == NULL); - BUG_ON(template == NULL); if (disk->integrity == NULL) { bi = kmem_cache_alloc(integrity_cachep, - GFP_KERNEL | __GFP_ZERO); + GFP_KERNEL | __GFP_ZERO); if (!bi) return -1; @@ -346,13 +346,16 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) bi = disk->integrity; /* Use the provided profile as template */ - bi->name = template->name; - bi->generate_fn = template->generate_fn; - bi->verify_fn = template->verify_fn; - bi->tuple_size = template->tuple_size; - bi->set_tag_fn = template->set_tag_fn; - bi->get_tag_fn = template->get_tag_fn; - bi->tag_size = template->tag_size; + if (template != NULL) { + bi->name = template->name; + bi->generate_fn = template->generate_fn; + bi->verify_fn = template->verify_fn; + bi->tuple_size = template->tuple_size; + bi->set_tag_fn = template->set_tag_fn; + bi->get_tag_fn = template->get_tag_fn; + bi->tag_size = template->tag_size; + } else + bi->name = "unsupported"; return 0; } From f48fc4d32e24c0b6a18aad30305d819bcc68c049 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 5 Jan 2009 10:17:25 +0100 Subject: [PATCH 42/60] block: get rid of the manual directory counting in blktrace It can result in a stuck blktrace system, if --kill is used. Signed-off-by: Jens Axboe --- block/blktrace.c | 72 ++++++++++++++---------------------------------- 1 file changed, 21 insertions(+), 51 deletions(-) diff --git a/block/blktrace.c b/block/blktrace.c index b0a2cae886db..39cc3bfe56e4 100644 --- a/block/blktrace.c +++ b/block/blktrace.c @@ -187,59 +187,12 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, static struct dentry *blk_tree_root; static DEFINE_MUTEX(blk_tree_mutex); -static unsigned int root_users; - -static inline void blk_remove_root(void) -{ - if (blk_tree_root) { - debugfs_remove(blk_tree_root); - blk_tree_root = NULL; - } -} - -static void blk_remove_tree(struct dentry *dir) -{ - mutex_lock(&blk_tree_mutex); - debugfs_remove(dir); - if (--root_users == 0) - blk_remove_root(); - mutex_unlock(&blk_tree_mutex); -} - -static struct dentry *blk_create_tree(const char *blk_name) -{ - struct dentry *dir = NULL; - int created = 0; - - mutex_lock(&blk_tree_mutex); - - if (!blk_tree_root) { - blk_tree_root = debugfs_create_dir("block", NULL); - if (!blk_tree_root) - goto err; - created = 1; - } - - dir = debugfs_create_dir(blk_name, blk_tree_root); - if (dir) - root_users++; - else { - /* Delete root only if we created it */ - if (created) - blk_remove_root(); - } - -err: - mutex_unlock(&blk_tree_mutex); - return dir; -} static void blk_trace_cleanup(struct blk_trace *bt) { - relay_close(bt->rchan); debugfs_remove(bt->msg_file); debugfs_remove(bt->dropped_file); - blk_remove_tree(bt->dir); + relay_close(bt->rchan); free_percpu(bt->sequence); free_percpu(bt->msg_data); kfree(bt); @@ -346,7 +299,18 @@ static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, static int blk_remove_buf_file_callback(struct dentry *dentry) { + struct dentry *parent = dentry->d_parent; debugfs_remove(dentry); + + /* + * this will fail for all but the last file, but that is ok. what we + * care about is the top level buts->name directory going away, when + * the last trace file is gone. Then we don't have to rmdir() that + * manually on trace stop, so it nicely solves the issue with + * force killing of running traces. + */ + + debugfs_remove(parent); return 0; } @@ -404,7 +368,15 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, goto err; ret = -ENOENT; - dir = blk_create_tree(buts->name); + + if (!blk_tree_root) { + blk_tree_root = debugfs_create_dir("block", NULL); + if (!blk_tree_root) + return -ENOMEM; + } + + dir = debugfs_create_dir(buts->name, blk_tree_root); + if (!dir) goto err; @@ -458,8 +430,6 @@ probe_err: atomic_dec(&blk_probes_ref); mutex_unlock(&blk_probe_mutex); err: - if (dir) - blk_remove_tree(dir); if (bt) { if (bt->msg_file) debugfs_remove(bt->msg_file); From 16642eb68216d8e0e136a99e514e9166e7125838 Mon Sep 17 00:00:00 2001 From: Alberto Bertogli Date: Mon, 5 Jan 2009 10:18:53 +0100 Subject: [PATCH 43/60] Fix small typo in bio.h's documentation Signed-off-by: Alberto Bertogli Signed-off-by: Jens Axboe --- include/linux/bio.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index 18fc4a281a7b..5175aa3103c6 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -144,7 +144,7 @@ struct bio { * bit 1 -- rw-ahead when set * bit 2 -- barrier * Insert a serialization point in the IO queue, forcing previously - * submitted IO to be completed before this oen is issued. + * submitted IO to be completed before this one is issued. * bit 3 -- synchronous I/O hint: the block layer will unplug immediately * Note that this does NOT indicate that the IO itself is sync, just * that the block layer will not postpone issue of this IO by plugging. From 1308835ffffe6d61ad1f48c5c381c9cc47f683ec Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Wed, 7 Jan 2009 12:22:39 +0100 Subject: [PATCH 44/60] block: export SSD/non-rotational queue flag through sysfs For some devices (i.e. CFA ATA) we can't reliably detect whether the device is of rotational or non-rotational type so we need to leave the final decision about this setting to the user-space. As a bonus do a minor CodingStyle fixup in queue_nomerges_store(). Suggested-by: Alan Cox Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index a29cb788e408..b538ab8dbbd1 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -130,6 +130,27 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) return queue_var_show(max_hw_sectors_kb, (page)); } +static ssize_t queue_nonrot_show(struct request_queue *q, char *page) +{ + return queue_var_show(!blk_queue_nonrot(q), page); +} + +static ssize_t queue_nonrot_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned long nm; + ssize_t ret = queue_var_store(&nm, page, count); + + spin_lock_irq(q->queue_lock); + if (nm) + queue_flag_clear(QUEUE_FLAG_NONROT, q); + else + queue_flag_set(QUEUE_FLAG_NONROT, q); + spin_unlock_irq(q->queue_lock); + + return ret; +} + static ssize_t queue_nomerges_show(struct request_queue *q, char *page) { return queue_var_show(blk_queue_nomerges(q), page); @@ -146,8 +167,8 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, queue_flag_set(QUEUE_FLAG_NOMERGES, q); else queue_flag_clear(QUEUE_FLAG_NOMERGES, q); - spin_unlock_irq(q->queue_lock); + return ret; } @@ -210,6 +231,12 @@ static struct queue_sysfs_entry queue_hw_sector_size_entry = { .show = queue_hw_sector_size_show, }; +static struct queue_sysfs_entry queue_nonrot_entry = { + .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, + .show = queue_nonrot_show, + .store = queue_nonrot_store, +}; + static struct queue_sysfs_entry queue_nomerges_entry = { .attr = {.name = "nomerges", .mode = S_IRUGO | S_IWUSR }, .show = queue_nomerges_show, @@ -229,6 +256,7 @@ static struct attribute *default_attrs[] = { &queue_max_sectors_entry.attr, &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, + &queue_nonrot_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, NULL, From 213d9417fec62ef4c3675621b9364a667954d4dd Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 6 Jan 2009 09:16:05 +0100 Subject: [PATCH 45/60] block: seperate bio/request unplug and sync bits Signed-off-by: Jens Axboe --- block/blk-core.c | 5 ++++- include/linux/bio.h | 18 +++++++++++------- include/linux/blkdev.h | 2 ++ 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index a824e49c0d0a..9e2e86fb78b8 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1125,6 +1125,8 @@ void init_request_from_bio(struct request *req, struct bio *bio) if (bio_sync(bio)) req->cmd_flags |= REQ_RW_SYNC; + if (bio_unplug(bio)) + req->cmd_flags |= REQ_UNPLUG; if (bio_rw_meta(bio)) req->cmd_flags |= REQ_RW_META; @@ -1141,6 +1143,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) int el_ret, nr_sectors; const unsigned short prio = bio_prio(bio); const int sync = bio_sync(bio); + const int unplug = bio_unplug(bio); int rw_flags; nr_sectors = bio_sectors(bio); @@ -1244,7 +1247,7 @@ get_rq: blk_plug_device(q); add_request(q, req); out: - if (sync || blk_queue_nonrot(q)) + if (unplug || blk_queue_nonrot(q)) __generic_unplug_device(q); spin_unlock_irq(q->queue_lock); return 0; diff --git a/include/linux/bio.h b/include/linux/bio.h index 5175aa3103c6..f53568c5852e 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -163,12 +163,15 @@ struct bio { #define BIO_RW 0 /* Must match RW in req flags (blkdev.h) */ #define BIO_RW_AHEAD 1 /* Must match FAILFAST in req flags */ #define BIO_RW_BARRIER 2 -#define BIO_RW_SYNC 3 -#define BIO_RW_META 4 -#define BIO_RW_DISCARD 5 -#define BIO_RW_FAILFAST_DEV 6 -#define BIO_RW_FAILFAST_TRANSPORT 7 -#define BIO_RW_FAILFAST_DRIVER 8 +#define BIO_RW_SYNCIO 3 +#define BIO_RW_UNPLUG 4 +#define BIO_RW_META 5 +#define BIO_RW_DISCARD 6 +#define BIO_RW_FAILFAST_DEV 7 +#define BIO_RW_FAILFAST_TRANSPORT 8 +#define BIO_RW_FAILFAST_DRIVER 9 + +#define BIO_RW_SYNC (BIO_RW_SYNCIO | BIO_RW_UNPLUG) /* * upper 16 bits of bi_rw define the io priority of this bio @@ -194,7 +197,8 @@ struct bio { #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) #define bio_sectors(bio) ((bio)->bi_size >> 9) #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) -#define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC)) +#define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNCIO)) +#define bio_unplug(bio) ((bio)->bi_rw & (1 << BIO_RW_UNPLUG)) #define bio_failfast_dev(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DEV)) #define bio_failfast_transport(bio) \ ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_TRANSPORT)) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 044467ef7b11..75426e4b8cdf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -108,6 +108,7 @@ enum rq_flag_bits { __REQ_RW_META, /* metadata io request */ __REQ_COPY_USER, /* contains copies of user pages */ __REQ_INTEGRITY, /* integrity metadata has been remapped */ + __REQ_UNPLUG, /* unplug queue on submission */ __REQ_NR_BITS, /* stops here */ }; @@ -134,6 +135,7 @@ enum rq_flag_bits { #define REQ_RW_META (1 << __REQ_RW_META) #define REQ_COPY_USER (1 << __REQ_COPY_USER) #define REQ_INTEGRITY (1 << __REQ_INTEGRITY) +#define REQ_UNPLUG (1 << __REQ_UNPLUG) #define BLK_MAX_CDB 16 From 1dfa17f4ab8543d82caf4d36636b93916a18f456 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 6 Jan 2009 09:21:49 +0100 Subject: [PATCH 46/60] block: add bio_rw_flagged() for testing bio->bi_rw The existing functions for checking bio->bi_rw are badly named. So lets mirror what we do for bio->bi_flags testing, use a properly named function so that it's immediately obvious what is being tested. Maintain compatability names for the old macros, eventually we'll get rid of these. Signed-off-by: Jens Axboe --- include/linux/bio.h | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/include/linux/bio.h b/include/linux/bio.h index f53568c5852e..0942765cf8c0 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -173,6 +173,24 @@ struct bio { #define BIO_RW_SYNC (BIO_RW_SYNCIO | BIO_RW_UNPLUG) +#define bio_rw_flagged(bio, flag) ((bio)->bi_rw & (1 << (flag))) + +/* + * Old defines, these should eventually be replaced by direct usage of + * bio_rw_flagged() + */ +#define bio_barrier(bio) bio_rw_flagged(bio, BIO_RW_BARRIER) +#define bio_sync(bio) bio_rw_flagged(bio, BIO_RW_SYNCIO) +#define bio_unplug(bio) bio_rw_flagged(bio, BIO_RW_UNPLUG) +#define bio_failfast_dev(bio) bio_rw_flagged(bio, BIO_RW_FAILFAST_DEV) +#define bio_failfast_transport(bio) \ + bio_rw_flagged(bio, BIO_RW_FAILFAST_TRANSPORT) +#define bio_failfast_driver(bio) \ + bio_rw_flagged(bio, BIO_RW_FAILFAST_DRIVER) +#define bio_rw_ahead(bio) bio_rw_flagged(bio, BIO_RW_AHEAD) +#define bio_rw_meta(bio) bio_rw_flagged(bio, BIO_RW_META) +#define bio_discard(bio) bio_rw_flagged(bio, BIO_RW_DISCARD) + /* * upper 16 bits of bi_rw define the io priority of this bio */ @@ -196,16 +214,6 @@ struct bio { #define bio_offset(bio) bio_iovec((bio))->bv_offset #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) #define bio_sectors(bio) ((bio)->bi_size >> 9) -#define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) -#define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNCIO)) -#define bio_unplug(bio) ((bio)->bi_rw & (1 << BIO_RW_UNPLUG)) -#define bio_failfast_dev(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DEV)) -#define bio_failfast_transport(bio) \ - ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_TRANSPORT)) -#define bio_failfast_driver(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST_DRIVER)) -#define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) -#define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META)) -#define bio_discard(bio) ((bio)->bi_rw & (1 << BIO_RW_DISCARD)) #define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio)) static inline unsigned int bio_cur_sectors(struct bio *bio) From dbdac9b71dff5d27885f82eb2cfca310861fdf9e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 13 Jan 2009 15:27:32 +0100 Subject: [PATCH 47/60] block: Fix documentation for blkdev_issue_flush() Signed-off-by: "Theodore Ts'o" Signed-off-by: Jens Axboe --- block/blk-barrier.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 8eba4e43bb0c..f7dae57e6cab 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -302,7 +302,7 @@ static void bio_end_empty_barrier(struct bio *bio, int err) * Description: * Issue a flush for the block device in question. Caller can supply * room for storing the error offset in case of a flush error, if they - * wish to. Caller must run wait_for_completion() on its own. + * wish to. */ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) { From cec0707e40ae25794b5a2de7b7f03c51961f80d9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 13 Jan 2009 15:28:32 +0100 Subject: [PATCH 48/60] block: silently error an unsupported barrier bio This fixes a "regression" from 2.6.28, where the barrier probes that file systems may do would trigger additional end request warnings in dmesg. Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 9e2e86fb78b8..ae75c047f45d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1451,6 +1451,11 @@ static inline void __generic_make_request(struct bio *bio) err = -EOPNOTSUPP; goto end_io; } + if (bio_barrier(bio) && bio_has_data(bio) && + (q->next_ordered == QUEUE_ORDERED_NONE)) { + err = -EOPNOTSUPP; + goto end_io; + } ret = q->make_request_fn(q, bio); } while (ret); From a229fc61ef0ee3c30fd193beee0eeb87410227f1 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Mon, 19 Jan 2009 10:37:38 +0100 Subject: [PATCH 49/60] include/linux: Add bsg.h to the Kernel exported headers bsg.h in current form is perfectly suitable for user-mode consumption. It is needed together with scsi/sg.h for applications that want to interface with the bsg driver. Currently the few projects that use it would copy it over into the projects. But that is not acceptable for projects that need to provide source and devel packages for distros. This should also be submitted to stable 2.6.28 and 2.6.27 since bsg had a stable API since these Kernels and distro users will need the header for these kernels a swell Signed-off-by: Boaz Harrosh Acked-by: FUJITA Tomonori CC: stable@kernel.org Signed-off-by: Jens Axboe --- include/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/Kbuild b/include/linux/Kbuild index 12e9a2957caf..2124c063a7ef 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -41,6 +41,7 @@ header-y += baycom.h header-y += bfs_fs.h header-y += blkpg.h header-y += bpqether.h +header-y += bsg.h header-y += can.h header-y += cdk.h header-y += chio.h From 7598909e3ee2a08726276d6415b69dadb52d0d76 Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Tue, 27 Jan 2009 09:29:24 +0100 Subject: [PATCH 50/60] Mark mandatory elevator functions in the biodoc.txt biodoc.txt mentions that elevator functions marked with * are mandatory, but no function is marked with *. Mark the 3 functions which should be implemented by any io scheduler. Signed-off-by: Nikanth Karthikesan Signed-off-by: Jens Axboe --- Documentation/block/biodoc.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index 5d2480d33b43..ecad6ee75705 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -954,14 +954,14 @@ elevator_allow_merge_fn called whenever the block layer determines results in some sort of conflict internally, this hook allows it to do that. -elevator_dispatch_fn fills the dispatch queue with ready requests. +elevator_dispatch_fn* fills the dispatch queue with ready requests. I/O schedulers are free to postpone requests by not filling the dispatch queue unless @force is non-zero. Once dispatched, I/O schedulers are not allowed to manipulate the requests - they belong to generic dispatch queue. -elevator_add_req_fn called to add a new request into the scheduler +elevator_add_req_fn* called to add a new request into the scheduler elevator_queue_empty_fn returns true if the merge queue is empty. Drivers shouldn't use this, but rather check @@ -991,7 +991,7 @@ elevator_activate_req_fn Called when device driver first sees a request. elevator_deactivate_req_fn Called when device driver decides to delay a request by requeueing it. -elevator_init_fn +elevator_init_fn* elevator_exit_fn Allocate and free any elevator specific storage for a queue. From bc58ba9468d94d62c56ab9b47173583ec140b165 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 23 Jan 2009 10:54:44 +0100 Subject: [PATCH 51/60] block: add sysfs file for controlling io stats accounting This allows us to turn off disk stat accounting completely, for the cases where the 0.5-1% reduction in system time is important. Signed-off-by: Jens Axboe --- block/blk-core.c | 90 +++++++++++++++++++++++++----------------- block/blk-sysfs.c | 28 +++++++++++++ include/linux/blkdev.h | 6 +++ 3 files changed, 88 insertions(+), 36 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index ae75c047f45d..ca69f3d94100 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -64,11 +64,12 @@ static struct workqueue_struct *kblockd_workqueue; static void drive_stat_acct(struct request *rq, int new_io) { + struct gendisk *disk = rq->rq_disk; struct hd_struct *part; int rw = rq_data_dir(rq); int cpu; - if (!blk_fs_request(rq) || !rq->rq_disk) + if (!blk_fs_request(rq) || !disk || !blk_queue_io_stat(disk->queue)) return; cpu = part_stat_lock(); @@ -599,8 +600,7 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) q->request_fn = rfn; q->prep_rq_fn = NULL; q->unplug_fn = generic_unplug_device; - q->queue_flags = (1 << QUEUE_FLAG_CLUSTER | - 1 << QUEUE_FLAG_STACKABLE); + q->queue_flags = QUEUE_FLAG_DEFAULT; q->queue_lock = lock; blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK); @@ -1663,6 +1663,55 @@ void blkdev_dequeue_request(struct request *req) } EXPORT_SYMBOL(blkdev_dequeue_request); +static void blk_account_io_completion(struct request *req, unsigned int bytes) +{ + struct gendisk *disk = req->rq_disk; + + if (!disk || !blk_queue_io_stat(disk->queue)) + return; + + if (blk_fs_request(req)) { + const int rw = rq_data_dir(req); + struct hd_struct *part; + int cpu; + + cpu = part_stat_lock(); + part = disk_map_sector_rcu(req->rq_disk, req->sector); + part_stat_add(cpu, part, sectors[rw], bytes >> 9); + part_stat_unlock(); + } +} + +static void blk_account_io_done(struct request *req) +{ + struct gendisk *disk = req->rq_disk; + + if (!disk || !blk_queue_io_stat(disk->queue)) + return; + + /* + * Account IO completion. bar_rq isn't accounted as a normal + * IO on queueing nor completion. Accounting the containing + * request is enough. + */ + if (blk_fs_request(req) && req != &req->q->bar_rq) { + unsigned long duration = jiffies - req->start_time; + const int rw = rq_data_dir(req); + struct hd_struct *part; + int cpu; + + cpu = part_stat_lock(); + part = disk_map_sector_rcu(disk, req->sector); + + part_stat_inc(cpu, part, ios[rw]); + part_stat_add(cpu, part, ticks[rw], duration); + part_round_stats(cpu, part); + part_dec_in_flight(part); + + part_stat_unlock(); + } +} + /** * __end_that_request_first - end I/O on a request * @req: the request being processed @@ -1698,16 +1747,7 @@ static int __end_that_request_first(struct request *req, int error, (unsigned long long)req->sector); } - if (blk_fs_request(req) && req->rq_disk) { - const int rw = rq_data_dir(req); - struct hd_struct *part; - int cpu; - - cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, req->sector); - part_stat_add(cpu, part, sectors[rw], nr_bytes >> 9); - part_stat_unlock(); - } + blk_account_io_completion(req, nr_bytes); total_bytes = bio_nbytes = 0; while ((bio = req->bio) != NULL) { @@ -1787,8 +1827,6 @@ static int __end_that_request_first(struct request *req, int error, */ static void end_that_request_last(struct request *req, int error) { - struct gendisk *disk = req->rq_disk; - if (blk_rq_tagged(req)) blk_queue_end_tag(req->q, req); @@ -1800,27 +1838,7 @@ static void end_that_request_last(struct request *req, int error) blk_delete_timer(req); - /* - * Account IO completion. bar_rq isn't accounted as a normal - * IO on queueing nor completion. Accounting the containing - * request is enough. - */ - if (disk && blk_fs_request(req) && req != &req->q->bar_rq) { - unsigned long duration = jiffies - req->start_time; - const int rw = rq_data_dir(req); - struct hd_struct *part; - int cpu; - - cpu = part_stat_lock(); - part = disk_map_sector_rcu(disk, req->sector); - - part_stat_inc(cpu, part, ios[rw]); - part_stat_add(cpu, part, ticks[rw], duration); - part_round_stats(cpu, part); - part_dec_in_flight(part); - - part_stat_unlock(); - } + blk_account_io_done(req); if (req->end_io) req->end_io(req, error); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index b538ab8dbbd1..e29ddfc73cf4 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -197,6 +197,27 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) return ret; } +static ssize_t queue_iostats_show(struct request_queue *q, char *page) +{ + return queue_var_show(blk_queue_io_stat(q), page); +} + +static ssize_t queue_iostats_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned long stats; + ssize_t ret = queue_var_store(&stats, page, count); + + spin_lock_irq(q->queue_lock); + if (stats) + queue_flag_set(QUEUE_FLAG_IO_STAT, q); + else + queue_flag_clear(QUEUE_FLAG_IO_STAT, q); + spin_unlock_irq(q->queue_lock); + + return ret; +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -249,6 +270,12 @@ static struct queue_sysfs_entry queue_rq_affinity_entry = { .store = queue_rq_affinity_store, }; +static struct queue_sysfs_entry queue_iostats_entry = { + .attr = {.name = "iostats", .mode = S_IRUGO | S_IWUSR }, + .show = queue_iostats_show, + .store = queue_iostats_store, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -259,6 +286,7 @@ static struct attribute *default_attrs[] = { &queue_nonrot_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, + &queue_iostats_entry.attr, NULL, }; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 75426e4b8cdf..d08c4b8219a6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -451,6 +451,11 @@ struct request_queue #define QUEUE_FLAG_STACKABLE 13 /* supports request stacking */ #define QUEUE_FLAG_NONROT 14 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ +#define QUEUE_FLAG_IO_STAT 15 /* do IO stats */ + +#define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ + (1 << QUEUE_FLAG_CLUSTER) | \ + 1 << QUEUE_FLAG_STACKABLE) static inline int queue_is_locked(struct request_queue *q) { @@ -567,6 +572,7 @@ enum { #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) +#define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_flushing(q) ((q)->ordseq) #define blk_queue_stackable(q) \ test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) From 3a9a3f6cc55418dd1525e636dccbbe13c394f652 Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Fri, 30 Jan 2009 12:46:41 +0100 Subject: [PATCH 52/60] cfq-iosched: Allow RT requests to pre-empt ongoing BE timeslice This patch adds the ability to pre-empt an ongoing BE timeslice when a RT request is waiting for the current timeslice to complete. This reduces the wait time to disk for RT requests from an upper bound of 4 (current value of cfq_quantum) to 1 disk request. Applied Jens' suggeested changes to avoid the rb lookup and use !cfq_class_rt() and retested. Latency(secs) for the RT task when doing sequential reads from 10G file. | only RT | RT + BE | RT + BE + this patch small (512 byte) reads | 143 | 163 | 145 large (1Mb) reads | 142 | 158 | 146 Signed-off-by: Divyesh Shah Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index e8525fa72823..664ebfd092ec 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -84,6 +84,11 @@ struct cfq_data { */ struct cfq_rb_root service_tree; unsigned int busy_queues; + /* + * Used to track any pending rt requests so we can pre-empt current + * non-RT cfqq in service when this value is non-zero. + */ + unsigned int busy_rt_queues; int rq_in_driver; int sync_flight; @@ -562,6 +567,8 @@ static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(cfq_cfqq_on_rr(cfqq)); cfq_mark_cfqq_on_rr(cfqq); cfqd->busy_queues++; + if (cfq_class_rt(cfqq)) + cfqd->busy_rt_queues++; cfq_resort_rr_list(cfqd, cfqq); } @@ -581,6 +588,8 @@ static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(!cfqd->busy_queues); cfqd->busy_queues--; + if (cfq_class_rt(cfqq)) + cfqd->busy_rt_queues--; } /* @@ -1004,6 +1013,20 @@ static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) if (cfq_slice_used(cfqq)) goto expire; + /* + * If we have a RT cfqq waiting, then we pre-empt the current non-rt + * cfqq. + */ + if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) { + /* + * We simulate this as cfqq timed out so that it gets to bank + * the remaining of its time slice. + */ + cfq_log_cfqq(cfqd, cfqq, "preempt"); + cfq_slice_expired(cfqd, 1); + goto new_queue; + } + /* * The active queue has requests and isn't expired, allow it to * dispatch. @@ -1067,6 +1090,13 @@ __cfq_dispatch_requests(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (RB_EMPTY_ROOT(&cfqq->sort_list)) break; + /* + * If there is a non-empty RT cfqq waiting for current + * cfqq's timeslice to complete, pre-empt this cfqq + */ + if (!cfq_class_rt(cfqq) && cfqd->busy_rt_queues) + break; + } while (dispatched < max_dispatch); /* @@ -1801,6 +1831,12 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (rq_is_meta(rq) && !cfqq->meta_pending) return 1; + /* + * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. + */ + if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) + return 1; + if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) return 0; @@ -1870,7 +1906,8 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, /* * not the active queue - expire current slice if it is * idle and has expired it's mean thinktime or this new queue - * has some old slice time left and is of higher priority + * has some old slice time left and is of higher priority or + * this new queue is RT and the current one is BE */ cfq_preempt_queue(cfqd, cfqq); cfq_mark_cfqq_must_dispatch(cfqq); From 0461ec5bc7745b89a8ab67ba0ea497abd58a6301 Mon Sep 17 00:00:00 2001 From: Paul Larson Date: Fri, 30 Jan 2009 10:21:49 -0600 Subject: [PATCH 53/60] Add enable_ms to jsm driver This fixes a crash observed when non-existant enable_ms function is called for jsm driver. Signed-off-by: Scott Kilau Signed-off-by: Paul Larson Signed-off-by: Linus Torvalds --- drivers/serial/jsm/jsm_tty.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/serial/jsm/jsm_tty.c b/drivers/serial/jsm/jsm_tty.c index 3547558d2caf..324c74d2f666 100644 --- a/drivers/serial/jsm/jsm_tty.c +++ b/drivers/serial/jsm/jsm_tty.c @@ -161,6 +161,11 @@ static void jsm_tty_stop_rx(struct uart_port *port) channel->ch_bd->bd_ops->disable_receiver(channel); } +static void jsm_tty_enable_ms(struct uart_port *port) +{ + /* Nothing needed */ +} + static void jsm_tty_break(struct uart_port *port, int break_state) { unsigned long lock_flags; @@ -345,6 +350,7 @@ static struct uart_ops jsm_ops = { .start_tx = jsm_tty_start_tx, .send_xchar = jsm_tty_send_xchar, .stop_rx = jsm_tty_stop_rx, + .enable_ms = jsm_tty_enable_ms, .break_ctl = jsm_tty_break, .startup = jsm_tty_open, .shutdown = jsm_tty_close, From 33bfad54b58cf05cfe6678c3ec9235d4bc8db4c2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 30 Jan 2009 11:37:22 -0800 Subject: [PATCH 54/60] Allow opportunistic merging of VM_CAN_NONLINEAR areas Commit de33c8db5910cda599899dd431cc30d7c1018cbf ("Fix OOPS in mmap_region() when merging adjacent VM_LOCKED file segments") unified the vma merging of anonymous and file maps to just one place, which simplified the code and fixed a use-after-free bug that could cause an oops. But by doing the merge opportunistically before even having called ->mmap() on the file method, it now compares two different 'vm_flags' values: the pre-mmap() value of the new not-yet-formed vma, and previous mappings of the same file around it. And in doing so, it refused to merge the common file case, which adds a marker to say "I can be made non-linear". This fixes it by just adding a set of flags that don't have to match, because we know they are ok to merge. Currently it's only that single VM_CAN_NONLINEAR flag, but at least conceptually there could be others in the future. Reported-and-acked-by: Hugh Dickins Cc: Lee Schermerhorn Cc: Nick Piggin Cc: Andrew Morton Cc: Greg KH Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index d3fa10a726cf..c581df14d0de 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -658,6 +658,9 @@ again: remove_next = 1 + (end > next->vm_end); validate_mm(mm); } +/* Flags that can be inherited from an existing mapping when merging */ +#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR) + /* * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. @@ -665,7 +668,7 @@ again: remove_next = 1 + (end > next->vm_end); static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { - if (vma->vm_flags != vm_flags) + if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS) return 0; if (vma->vm_file != file) return 0; From 3ac6cffea4aa18007a454a7442da2855882f403d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 30 Jan 2009 16:32:22 +0900 Subject: [PATCH 55/60] linker script: use separate simpler definition for PERCPU() Impact: fix linker screwup on x86_32 Recent x86_64 zerobased patches introduced PERCPU_VADDR() to put .data.percpu to a predefined address and re-defined PERCPU() in terms of it. The new macro defined one extra symbol, __per_cpu_load, for LMA of the section so that the init data could be accessed. This new symbol introduced the following problems to x86_32. 1. If __per_cpu_load is defined outside of .data.percpu as an absolute symbol, relocation generation for relocatable kernel fails due to absolute relocation. 2. If __per_cpu_load is put inside .data.percpu with absolute address assignment to work around #1, linker gets confused and under certain configurations ends up relocating the symbol against .data.percpu such that the load address gets added on top of already set load address. As x86_32 doesn't use predefined address for .data.percpu, there's no need for it to care about the possibility of __per_cpu_load being different from __per_cpu_start. This patch defines PERCPU() separately so that __per_cpu_load is defined inside .data.percpu so that everything is ordinary linking-wise. Signed-off-by: Tejun Heo Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 53e21f36a802..5406e70aba86 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -445,10 +445,9 @@ * section in the linker script will go there too. @phdr should have * a leading colon. * - * This macro defines three symbols, __per_cpu_load, __per_cpu_start - * and __per_cpu_end. The first one is the vaddr of loaded percpu - * init data. __per_cpu_start equals @vaddr and __per_cpu_end is the - * end offset. + * Note that this macros defines __per_cpu_load as an absolute symbol. + * If there is no need to put the percpu section at a predetermined + * address, use PERCPU(). */ #define PERCPU_VADDR(vaddr, phdr) \ VMLINUX_SYMBOL(__per_cpu_load) = .; \ @@ -470,7 +469,20 @@ * Align to @align and outputs output section for percpu area. This * macro doesn't maniuplate @vaddr or @phdr and __per_cpu_load and * __per_cpu_start will be identical. + * + * This macro is equivalent to ALIGN(align); PERCPU_VADDR( , ) except + * that __per_cpu_load is defined as a relative symbol against + * .data.percpu which is required for relocatable x86_32 + * configuration. */ #define PERCPU(align) \ . = ALIGN(align); \ - PERCPU_VADDR( , ) + .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { \ + VMLINUX_SYMBOL(__per_cpu_load) = .; \ + VMLINUX_SYMBOL(__per_cpu_start) = .; \ + *(.data.percpu.first) \ + *(.data.percpu.page_aligned) \ + *(.data.percpu) \ + *(.data.percpu.shared_aligned) \ + VMLINUX_SYMBOL(__per_cpu_end) = .; \ + } From 2749ebe320ff9f77548d10fcc0a3464ac21c8e58 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Thu, 29 Jan 2009 15:35:26 -0600 Subject: [PATCH 56/60] x86: UV fix uv_flush_send_and_wait() Impact: fix possible tlb mis-flushing on UV uv_flush_send_and_wait() should return a pointer if the broadcast remote tlb shootdown requests fail. That causes the conventional IPI method of shootdown to be used. Signed-off-by: Cliff Wickman Signed-off-by: Tejun Heo --- arch/x86/kernel/tlb_uv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 89fce1b6d01f..f4b2f27d19b9 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -259,7 +259,7 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_blade, * the cpu's, all of which are still in the mask. */ __get_cpu_var(ptcstats).ptc_i++; - return 0; + return flush_mask; } /* From 552be871e67ff577ed36beb2f53d078b42304739 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 30 Jan 2009 17:47:53 +0900 Subject: [PATCH 57/60] x86: pass in cpu number to switch_to_new_gdt() Impact: cleanup, prepare for xen boot fix. Xen needs to call this function very early to setup the GDT and per-cpu segments. Remove the call to smp_processor_id() and just pass in the cpu number. Signed-off-by: Brian Gerst Signed-off-by: Tejun Heo --- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/cpu/common.c | 7 +++---- arch/x86/kernel/setup_percpu.c | 2 +- arch/x86/kernel/smpboot.c | 2 +- arch/x86/mach-voyager/voyager_smp.c | 11 ++++++----- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index befa20b4a68c..1c25eb69ea86 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -768,7 +768,7 @@ extern int sysenter_setup(void); extern struct desc_ptr early_gdt_descr; extern void cpu_set_gdt(int); -extern void switch_to_new_gdt(void); +extern void switch_to_new_gdt(int); extern void cpu_init(void); static inline unsigned long get_debugctlmsr(void) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 652fdc9a757a..6eacd64b602e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -255,10 +255,9 @@ __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; /* Current gdt points %fs at the "master" per-cpu area: after this, * it's on the real one. */ -void switch_to_new_gdt(void) +void switch_to_new_gdt(int cpu) { struct desc_ptr gdt_descr; - int cpu = smp_processor_id(); gdt_descr.address = (long)get_cpu_gdt_table(cpu); gdt_descr.size = GDT_SIZE - 1; @@ -993,7 +992,7 @@ void __cpuinit cpu_init(void) * and set up the GDT descriptor: */ - switch_to_new_gdt(); + switch_to_new_gdt(cpu); loadsegment(fs, 0); load_idt((const struct desc_ptr *)&idt_descr); @@ -1098,7 +1097,7 @@ void __cpuinit cpu_init(void) clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); load_idt(&idt_descr); - switch_to_new_gdt(); + switch_to_new_gdt(cpu); /* * Set up and load the per-CPU TSS and LDT diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 0d1e7ac439f4..ef91747bbed5 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -122,7 +122,7 @@ void __init setup_per_cpu_areas(void) * area. Reload any changed state for the boot CPU. */ if (cpu == boot_cpu_id) - switch_to_new_gdt(); + switch_to_new_gdt(cpu); DBG("PERCPU: cpu %4d %p\n", cpu, ptr); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index f9dbcff43546..612d3c74f6a3 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1185,7 +1185,7 @@ out: void __init native_smp_prepare_boot_cpu(void) { int me = smp_processor_id(); - switch_to_new_gdt(); + switch_to_new_gdt(me); /* already set me in cpu_online_mask in boot_cpu_init() */ cpumask_set_cpu(me, cpu_callout_mask); per_cpu(cpu_state, me) = CPU_ONLINE; diff --git a/arch/x86/mach-voyager/voyager_smp.c b/arch/x86/mach-voyager/voyager_smp.c index 331cd6d56483..58c7cac3440d 100644 --- a/arch/x86/mach-voyager/voyager_smp.c +++ b/arch/x86/mach-voyager/voyager_smp.c @@ -1746,12 +1746,13 @@ static void __init voyager_smp_prepare_cpus(unsigned int max_cpus) static void __cpuinit voyager_smp_prepare_boot_cpu(void) { - switch_to_new_gdt(); + int cpu = smp_processor_id(); + switch_to_new_gdt(cpu); - cpu_set(smp_processor_id(), cpu_online_map); - cpu_set(smp_processor_id(), cpu_callout_map); - cpu_set(smp_processor_id(), cpu_possible_map); - cpu_set(smp_processor_id(), cpu_present_map); + cpu_set(cpu, cpu_online_map); + cpu_set(cpu, cpu_callout_map); + cpu_set(cpu, cpu_possible_map); + cpu_set(cpu, cpu_present_map); } static int __cpuinit voyager_cpu_up(unsigned int cpu) From 11e3a840cd5b731cdd8f6f956dfae78a8046d09c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 30 Jan 2009 17:47:54 +0900 Subject: [PATCH 58/60] x86: split loading percpu segments from loading gdt Impact: split out a function, no functional change Xen needs to be able to access percpu data from very early on. For various reasons, it cannot also load the gdt at that time. It does, however, have a pefectly functional gdt at that point, so there's no pressing need to reload the gdt. Split the function to load the segment registers off, so Xen can call it directly. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Tejun Heo --- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/cpu/common.c | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 1c25eb69ea86..656d02ea509b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -769,6 +769,7 @@ extern struct desc_ptr early_gdt_descr; extern void cpu_set_gdt(int); extern void switch_to_new_gdt(int); +extern void load_percpu_segment(int); extern void cpu_init(void); static inline unsigned long get_debugctlmsr(void) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6eacd64b602e..0f73ea423089 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -253,6 +253,16 @@ static char __cpuinit *table_lookup_model(struct cpuinfo_x86 *c) __u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata; +void load_percpu_segment(int cpu) +{ +#ifdef CONFIG_X86_32 + loadsegment(fs, __KERNEL_PERCPU); +#else + loadsegment(gs, 0); + wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)); +#endif +} + /* Current gdt points %fs at the "master" per-cpu area: after this, * it's on the real one. */ void switch_to_new_gdt(int cpu) @@ -263,12 +273,8 @@ void switch_to_new_gdt(int cpu) gdt_descr.size = GDT_SIZE - 1; load_gdt(&gdt_descr); /* Reload the per-cpu base */ -#ifdef CONFIG_X86_32 - loadsegment(fs, __KERNEL_PERCPU); -#else - loadsegment(gs, 0); - wrmsrl(MSR_GS_BASE, (unsigned long)per_cpu(irq_stack_union.gs_base, cpu)); -#endif + + load_percpu_segment(cpu); } static struct cpu_dev *cpu_devs[X86_VENDOR_NUM] = {}; From 795f99b61d20c34cb04d17d8906b32f745a635ec Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 30 Jan 2009 17:47:54 +0900 Subject: [PATCH 59/60] xen: setup percpu data pointers Impact: fix xen booting We need to access percpu data fairly early, so set up the percpu registers as soon as possible. We only need to load the appropriate segment register. We already have a GDT, but its hard to change it early because we need to manipulate the pagetable to do so, and that hasn't been set up yet. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Tejun Heo --- arch/x86/xen/enlighten.c | 3 +++ arch/x86/xen/smp.c | 5 ++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bef941f61451..fe19c88a5029 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1647,6 +1647,9 @@ asmlinkage void __init xen_start_kernel(void) have_vcpu_info_placement = 0; #endif + /* setup percpu state */ + load_percpu_segment(0); + xen_smp_init(); /* Get mfn list */ diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 7735e3dd359c..88d5d5ec6beb 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -170,7 +170,8 @@ static void __init xen_smp_prepare_boot_cpu(void) /* We've switched to the "real" per-cpu gdt, so make sure the old memory can be recycled */ - make_lowmem_page_readwrite(&per_cpu_var(gdt_page)); + make_lowmem_page_readwrite(__per_cpu_load + + (unsigned long)&per_cpu_var(gdt_page)); xen_setup_vcpu_info_placement(); } @@ -235,6 +236,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle) ctxt->user_regs.ss = __KERNEL_DS; #ifdef CONFIG_X86_32 ctxt->user_regs.fs = __KERNEL_PERCPU; +#else + ctxt->gs_base_kernel = per_cpu_offset(cpu); #endif ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle; ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */ From ef3892bd63420380d115f755d351d2071f1f805f Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 2 Feb 2009 18:16:19 -0800 Subject: [PATCH 60/60] x86, percpu: fix kexec with vmlinux Impact: fix regression with kexec with vmlinux Split data.init into data.init, percpu, data.init2 sections instead of let data.init wrap percpu secion. Thus kexec loading will be happy, because sections will not overlap. Before the patch we have: Elf file type is EXEC (Executable file) Entry point 0x200000 There are 6 program headers, starting at offset 64 Program Headers: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flags Align LOAD 0x0000000000200000 0xffffffff80200000 0x0000000000200000 0x0000000000ca6000 0x0000000000ca6000 R E 200000 LOAD 0x0000000000ea6000 0xffffffff80ea6000 0x0000000000ea6000 0x000000000014dfe0 0x000000000014dfe0 RWE 200000 LOAD 0x0000000001000000 0xffffffffff600000 0x0000000000ff4000 0x0000000000000888 0x0000000000000888 RWE 200000 LOAD 0x00000000011f6000 0xffffffff80ff6000 0x0000000000ff6000 0x0000000000073086 0x0000000000a2d938 RWE 200000 LOAD 0x0000000001400000 0x0000000000000000 0x000000000106a000 0x00000000001d2ce0 0x00000000001d2ce0 RWE 200000 NOTE 0x00000000009e2c1c 0xffffffff809e2c1c 0x00000000009e2c1c 0x0000000000000024 0x0000000000000024 4 Section to Segment mapping: Segment Sections... 00 .text .notes __ex_table .rodata __bug_table .pci_fixup .builtin_fw __ksymtab __ksymtab_gpl __ksymtab_strings __init_rodata __param 01 .data .init.rodata .data.cacheline_aligned .data.read_mostly 02 .vsyscall_0 .vsyscall_fn .vsyscall_gtod_data .vsyscall_1 .vsyscall_2 .vgetcpu_mode .jiffies 03 .data.init_task .smp_locks .init.text .init.data .init.setup .initcall.init .con_initcall.init .x86_cpu_dev.init .altinstructions .altinstr_replacement .exit.text .init.ramfs .bss 04 .data.percpu 05 .notes After patch we've got: Elf file type is EXEC (Executable file) Entry point 0x200000 There are 7 program headers, starting at offset 64 Program Headers: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flags Align LOAD 0x0000000000200000 0xffffffff80200000 0x0000000000200000 0x0000000000ca6000 0x0000000000ca6000 R E 200000 LOAD 0x0000000000ea6000 0xffffffff80ea6000 0x0000000000ea6000 0x000000000014dfe0 0x000000000014dfe0 RWE 200000 LOAD 0x0000000001000000 0xffffffffff600000 0x0000000000ff4000 0x0000000000000888 0x0000000000000888 RWE 200000 LOAD 0x00000000011f6000 0xffffffff80ff6000 0x0000000000ff6000 0x0000000000073086 0x0000000000073086 RWE 200000 LOAD 0x0000000001400000 0x0000000000000000 0x000000000106a000 0x00000000001d2ce0 0x00000000001d2ce0 RWE 200000 LOAD 0x000000000163d000 0xffffffff8123d000 0x000000000123d000 0x0000000000000000 0x00000000007e6938 RWE 200000 NOTE 0x00000000009e2c1c 0xffffffff809e2c1c 0x00000000009e2c1c 0x0000000000000024 0x0000000000000024 4 Section to Segment mapping: Segment Sections... 00 .text .notes __ex_table .rodata __bug_table .pci_fixup .builtin_fw __ksymtab __ksymtab_gpl __ksymtab_strings __init_rodata __param 01 .data .init.rodata .data.cacheline_aligned .data.read_mostly 02 .vsyscall_0 .vsyscall_fn .vsyscall_gtod_data .vsyscall_1 .vsyscall_2 .vgetcpu_mode .jiffies 03 .data.init_task .smp_locks .init.text .init.data .init.setup .initcall.init .con_initcall.init .x86_cpu_dev.init .altinstructions .altinstr_replacement .exit.text .init.ramfs 04 .data.percpu 05 .bss 06 .notes Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux_64.lds.S | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index c9740996430a..07f62d287ff0 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -22,6 +22,7 @@ PHDRS { #ifdef CONFIG_SMP percpu PT_LOAD FLAGS(7); /* RWE */ #endif + data.init2 PT_LOAD FLAGS(7); /* RWE */ note PT_NOTE FLAGS(0); /* ___ */ } SECTIONS @@ -215,7 +216,7 @@ SECTIONS /* * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the * output PHDR, so the next output section - __data_nosave - should - * switch it back to data.init. Also, pda should be at the head of + * start another section data.init2. Also, pda should be at the head of * percpu area. Preallocate it and define the percpu offset symbol * so that it can be accessed as a percpu variable. */ @@ -232,7 +233,7 @@ SECTIONS __nosave_begin = .; .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { *(.data.nosave) - } :data.init /* switch back to data.init, see PERCPU_VADDR() above */ + } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */ . = ALIGN(PAGE_SIZE); __nosave_end = .;