From 766ea8cce007e699679109df4fa469b870ba4860 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Aug 2006 15:49:53 -0700 Subject: [PATCH 01/12] [NET]: Fix alloc_skb comment typo Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 19c96d498e20..3573ba9a2555 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1081,7 +1081,7 @@ static inline void __skb_queue_purge(struct sk_buff_head *list) * the headroom they think they need without accounting for the * built in space. The built in space is used for optimisations. * - * %NULL is returned in there is no free memory. + * %NULL is returned if there is no free memory. */ static inline struct sk_buff *__dev_alloc_skb(unsigned int length, gfp_t gfp_mask) @@ -1101,7 +1101,7 @@ static inline struct sk_buff *__dev_alloc_skb(unsigned int length, * the headroom they think they need without accounting for the * built in space. The built in space is used for optimisations. * - * %NULL is returned in there is no free memory. Although this function + * %NULL is returned if there is no free memory. Although this function * allocates memory it can be called from an interrupt. */ static inline struct sk_buff *dev_alloc_skb(unsigned int length) From 7b2e497a06c0e93719fda88820e057b635e8fae2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Aug 2006 16:09:04 -0700 Subject: [PATCH 02/12] [NET]: Assign skb->dev in netdev_alloc_skb Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/core/skbuff.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 022d8894c11d..c54f3664bce5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -268,8 +268,10 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, struct sk_buff *skb; skb = alloc_skb(length + NET_SKB_PAD, gfp_mask); - if (likely(skb)) + if (likely(skb)) { skb_reserve(skb, NET_SKB_PAD); + skb->dev = dev; + } return skb; } From d14cc9a342a8004b0ecfe66f1f12120962b61d8c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 7 Aug 2006 16:11:48 -0700 Subject: [PATCH 03/12] [TG3]: skb->dev assignment is done by netdev_alloc_skb All caller of netdev_alloc_skb need to assign skb->dev shortly afterwards. Move it into common code. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- drivers/net/tg3.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 6f97962dd06b..0afbed6753fa 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -3101,7 +3101,6 @@ static int tg3_alloc_rx_skb(struct tg3 *tp, u32 opaque_key, if (skb == NULL) return -ENOMEM; - skb->dev = tp->dev; skb_reserve(skb, tp->rx_offset); mapping = pci_map_single(tp->pdev, skb->data, @@ -3274,7 +3273,6 @@ static int tg3_rx(struct tg3 *tp, int budget) if (copy_skb == NULL) goto drop_it_no_recycle; - copy_skb->dev = tp->dev; skb_reserve(copy_skb, 2); skb_put(copy_skb, len); pci_dma_sync_single_for_cpu(tp->pdev, dma_addr, len, PCI_DMA_FROMDEVICE); From 8b5cc5ef40c83c6ea4c90b203bb2c8b17edfa11b Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 7 Aug 2006 20:09:20 -0700 Subject: [PATCH 04/12] [IPX]: Header length validation needed This patch will linearize and check there is enough data. It handles the pprop case as well as avoiding a whole audit of the routing code. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipx/af_ipx.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index aa34ff4b707c..c13e86b14f69 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1646,7 +1646,8 @@ static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_ty ipx_pktsize = ntohs(ipx->ipx_pktsize); /* Too small or invalid header? */ - if (ipx_pktsize < sizeof(struct ipxhdr) || ipx_pktsize > skb->len) + if (ipx_pktsize < sizeof(struct ipxhdr) || + !pskb_may_pull(skb, ipx_pktsize)) goto drop; if (ipx->ipx_checksum != IPX_NO_CHECKSUM && From 8d1502de27c46b365b5c86e17d173083d3d6c9ac Mon Sep 17 00:00:00 2001 From: Kirill Korotaev Date: Mon, 7 Aug 2006 20:44:22 -0700 Subject: [PATCH 05/12] [IPV4]: Limit rt cache size properly. From: Kirill Korotaev During OpenVZ stress testing we found that UDP traffic with random src can generate too much excessive rt hash growing leading finally to OOM and kernel panics. It was found that for 4GB i686 system (having 1048576 total pages and 225280 normal zone pages) kernel allocates the following route hash: syslog: IP route cache hash table entries: 262144 (order: 8, 1048576 bytes) => ip_rt_max_size = 4194304 entries, i.e. max rt size is 4194304 * 256b = 1Gb of RAM > normal_zone Attached the patch which removes HASH_HIGHMEM flag from alloc_large_system_hash() call. Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 19bd49d69d9f..b873cbcdd0b8 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3157,7 +3157,7 @@ int __init ip_rt_init(void) rhash_entries, (num_physpages >= 128 * 1024) ? 15 : 17, - HASH_HIGHMEM, + 0, &rt_hash_log, &rt_hash_mask, 0); From aaf580601ff244df82324fff380ed6740f27ef03 Mon Sep 17 00:00:00 2001 From: Chen-Li Tien Date: Mon, 7 Aug 2006 20:49:07 -0700 Subject: [PATCH 06/12] [PKTGEN]: Fix oops when used with balance-tlb bonding Signed-off-by: Chen-Li Tien Signed-off-by: David S. Miller --- net/core/pktgen.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 67ed14ddabd2..b1743375eb6d 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2149,6 +2149,8 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev, skb->mac.raw = ((u8 *) iph) - 14 - pkt_dev->nr_labels*sizeof(u32); skb->dev = odev; skb->pkt_type = PACKET_HOST; + skb->nh.iph = iph; + skb->h.uh = udph; if (pkt_dev->nfrags <= 0) pgh = (struct pktgen_hdr *)skb_put(skb, datalen); From 69d8c28c9578ce78b3dc1b9be36926d962282898 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 7 Aug 2006 20:52:10 -0700 Subject: [PATCH 07/12] [PKTGEN]: Make sure skb->{nh,h} are initialized in fill_packet_ipv6() too. Mirror the bug fix from fill_packet_ipv4() Signed-off-by: David S. Miller --- net/core/pktgen.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index b1743375eb6d..6a7320b39ed0 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2462,6 +2462,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev, skb->protocol = protocol; skb->dev = odev; skb->pkt_type = PACKET_HOST; + skb->nh.ipv6h = iph; + skb->h.uh = udph; if (pkt_dev->nfrags <= 0) pgh = (struct pktgen_hdr *)skb_put(skb, datalen); From bd37a088596ccdb2b2dd3299e25e333bca7a9a34 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Mon, 7 Aug 2006 21:04:15 -0700 Subject: [PATCH 08/12] [TCP]: SNMPv2 tcpOutSegs counter error Do not count retransmitted segments. Signed-off-by: Wei Yongjun Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 5c08ea20a18d..507adefbc17c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -466,7 +466,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, if (skb->len != tcp_header_size) tcp_event_data_sent(tp, skb, sk); - TCP_INC_STATS(TCP_MIB_OUTSEGS); + if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) + TCP_INC_STATS(TCP_MIB_OUTSEGS); err = icsk->icsk_af_ops->queue_xmit(skb, 0); if (likely(err <= 0)) @@ -2157,10 +2158,9 @@ int tcp_connect(struct sock *sk) skb_shinfo(buff)->gso_size = 0; skb_shinfo(buff)->gso_type = 0; buff->csum = 0; + tp->snd_nxt = tp->write_seq; TCP_SKB_CB(buff)->seq = tp->write_seq++; TCP_SKB_CB(buff)->end_seq = tp->write_seq; - tp->snd_nxt = tp->write_seq; - tp->pushed_seq = tp->write_seq; /* Send it off. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; @@ -2170,6 +2170,12 @@ int tcp_connect(struct sock *sk) sk_charge_skb(sk, buff); tp->packets_out += tcp_skb_pcount(buff); tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); + + /* We change tp->snd_nxt after the tcp_transmit_skb() call + * in order to make this packet get counted in tcpOutSegs. + */ + tp->snd_nxt = tp->write_seq; + tp->pushed_seq = tp->write_seq; TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ From 1b2a720506ccf7c30baaeda5d990c29b31e21726 Mon Sep 17 00:00:00 2001 From: Michael Chan Date: Mon, 7 Aug 2006 21:46:02 -0700 Subject: [PATCH 09/12] [TG3]: Fix tx race condition Fix a subtle race condition between tg3_start_xmit() and tg3_tx() discovered by Herbert Xu : CPU0 CPU1 tg3_start_xmit() if (tx_ring_full) { tx_lock tg3_tx() if (!netif_queue_stopped) netif_stop_queue() if (!tx_ring_full) update_tx_ring netif_wake_queue() tx_unlock } Even though tx_ring is updated before the if statement in tg3_tx() in program order, it can be re-ordered by the CPU as shown above. This scenario can cause the tx queue to be stopped forever if tg3_tx() has just freed up the entire tx_ring. The possibility of this happening should be very rare though. The following changes are made: 1. Add memory barrier to fix the above race condition. 2. Eliminate the private tx_lock altogether and rely solely on netif_tx_lock. This eliminates one spinlock in tg3_start_xmit() when the ring is full. 3. Because of 2, use netif_tx_lock in tg3_tx() before calling netif_wake_queue(). 4. Change TX_BUFFS_AVAIL to an inline function with a memory barrier. Herbert and David suggested using the memory barrier instead of volatile. 5. Check for the full wake queue condition before getting netif_tx_lock in tg3_tx(). This reduces the number of unnecessary spinlocks when the tx ring is full in a steady-state condition. 6. Update version to 3.65. Signed-off-by: Michael Chan Acked-by: Herbert Xu Signed-off-by: David S. Miller --- drivers/net/tg3.c | 49 +++++++++++++++++++++++++++-------------------- drivers/net/tg3.h | 8 +++----- 2 files changed, 31 insertions(+), 26 deletions(-) diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 0afbed6753fa..eafabb253f08 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -68,8 +68,8 @@ #define DRV_MODULE_NAME "tg3" #define PFX DRV_MODULE_NAME ": " -#define DRV_MODULE_VERSION "3.64" -#define DRV_MODULE_RELDATE "July 31, 2006" +#define DRV_MODULE_VERSION "3.65" +#define DRV_MODULE_RELDATE "August 07, 2006" #define TG3_DEF_MAC_MODE 0 #define TG3_DEF_RX_MODE 0 @@ -123,9 +123,6 @@ TG3_RX_RCB_RING_SIZE(tp)) #define TG3_TX_RING_BYTES (sizeof(struct tg3_tx_buffer_desc) * \ TG3_TX_RING_SIZE) -#define TX_BUFFS_AVAIL(TP) \ - ((TP)->tx_pending - \ - (((TP)->tx_prod - (TP)->tx_cons) & (TG3_TX_RING_SIZE - 1))) #define NEXT_TX(N) (((N) + 1) & (TG3_TX_RING_SIZE - 1)) #define RX_PKT_BUF_SZ (1536 + tp->rx_offset + 64) @@ -2987,6 +2984,13 @@ static void tg3_tx_recover(struct tg3 *tp) spin_unlock(&tp->lock); } +static inline u32 tg3_tx_avail(struct tg3 *tp) +{ + smp_mb(); + return (tp->tx_pending - + ((tp->tx_prod - tp->tx_cons) & (TG3_TX_RING_SIZE - 1))); +} + /* Tigon3 never reports partial packet sends. So we do not * need special logic to handle SKBs that have not had all * of their frags sent yet, like SunGEM does. @@ -3038,12 +3042,20 @@ static void tg3_tx(struct tg3 *tp) tp->tx_cons = sw_idx; - if (unlikely(netif_queue_stopped(tp->dev))) { - spin_lock(&tp->tx_lock); + /* Need to make the tx_cons update visible to tg3_start_xmit() + * before checking for netif_queue_stopped(). Without the + * memory barrier, there is a small possibility that tg3_start_xmit() + * will miss it and cause the queue to be stopped forever. + */ + smp_mb(); + + if (unlikely(netif_queue_stopped(tp->dev) && + (tg3_tx_avail(tp) > TG3_TX_WAKEUP_THRESH))) { + netif_tx_lock(tp->dev); if (netif_queue_stopped(tp->dev) && - (TX_BUFFS_AVAIL(tp) > TG3_TX_WAKEUP_THRESH)) + (tg3_tx_avail(tp) > TG3_TX_WAKEUP_THRESH)) netif_wake_queue(tp->dev); - spin_unlock(&tp->tx_lock); + netif_tx_unlock(tp->dev); } } @@ -3795,7 +3807,7 @@ static int tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) * interrupt. Furthermore, IRQ processing runs lockless so we have * no IRQ context deadlocks to worry about either. Rejoice! */ - if (unlikely(TX_BUFFS_AVAIL(tp) <= (skb_shinfo(skb)->nr_frags + 1))) { + if (unlikely(tg3_tx_avail(tp) <= (skb_shinfo(skb)->nr_frags + 1))) { if (!netif_queue_stopped(dev)) { netif_stop_queue(dev); @@ -3891,12 +3903,10 @@ static int tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) tw32_tx_mbox((MAILBOX_SNDHOST_PROD_IDX_0 + TG3_64BIT_REG_LOW), entry); tp->tx_prod = entry; - if (unlikely(TX_BUFFS_AVAIL(tp) <= (MAX_SKB_FRAGS + 1))) { - spin_lock(&tp->tx_lock); + if (unlikely(tg3_tx_avail(tp) <= (MAX_SKB_FRAGS + 1))) { netif_stop_queue(dev); - if (TX_BUFFS_AVAIL(tp) > TG3_TX_WAKEUP_THRESH) + if (tg3_tx_avail(tp) > TG3_TX_WAKEUP_THRESH) netif_wake_queue(tp->dev); - spin_unlock(&tp->tx_lock); } out_unlock: @@ -3918,7 +3928,7 @@ static int tg3_tso_bug(struct tg3 *tp, struct sk_buff *skb) struct sk_buff *segs, *nskb; /* Estimate the number of fragments in the worst case */ - if (unlikely(TX_BUFFS_AVAIL(tp) <= (skb_shinfo(skb)->gso_segs * 3))) { + if (unlikely(tg3_tx_avail(tp) <= (skb_shinfo(skb)->gso_segs * 3))) { netif_stop_queue(tp->dev); return NETDEV_TX_BUSY; } @@ -3958,7 +3968,7 @@ static int tg3_start_xmit_dma_bug(struct sk_buff *skb, struct net_device *dev) * interrupt. Furthermore, IRQ processing runs lockless so we have * no IRQ context deadlocks to worry about either. Rejoice! */ - if (unlikely(TX_BUFFS_AVAIL(tp) <= (skb_shinfo(skb)->nr_frags + 1))) { + if (unlikely(tg3_tx_avail(tp) <= (skb_shinfo(skb)->nr_frags + 1))) { if (!netif_queue_stopped(dev)) { netif_stop_queue(dev); @@ -4108,12 +4118,10 @@ static int tg3_start_xmit_dma_bug(struct sk_buff *skb, struct net_device *dev) tw32_tx_mbox((MAILBOX_SNDHOST_PROD_IDX_0 + TG3_64BIT_REG_LOW), entry); tp->tx_prod = entry; - if (unlikely(TX_BUFFS_AVAIL(tp) <= (MAX_SKB_FRAGS + 1))) { - spin_lock(&tp->tx_lock); + if (unlikely(tg3_tx_avail(tp) <= (MAX_SKB_FRAGS + 1))) { netif_stop_queue(dev); - if (TX_BUFFS_AVAIL(tp) > TG3_TX_WAKEUP_THRESH) + if (tg3_tx_avail(tp) > TG3_TX_WAKEUP_THRESH) netif_wake_queue(tp->dev); - spin_unlock(&tp->tx_lock); } out_unlock: @@ -11472,7 +11480,6 @@ static int __devinit tg3_init_one(struct pci_dev *pdev, tp->grc_mode |= GRC_MODE_BSWAP_NONFRM_DATA; #endif spin_lock_init(&tp->lock); - spin_lock_init(&tp->tx_lock); spin_lock_init(&tp->indirect_lock); INIT_WORK(&tp->reset_task, tg3_reset_task, tp); diff --git a/drivers/net/tg3.h b/drivers/net/tg3.h index ba2c98711c88..3ecf356cfb08 100644 --- a/drivers/net/tg3.h +++ b/drivers/net/tg3.h @@ -2079,9 +2079,9 @@ struct tg3 { * lock: Held during reset, PHY access, timer, and when * updating tg3_flags and tg3_flags2. * - * tx_lock: Held during tg3_start_xmit and tg3_tx only - * when calling netif_[start|stop]_queue. - * tg3_start_xmit is protected by netif_tx_lock. + * netif_tx_lock: Held during tg3_start_xmit. tg3_tx holds + * netif_tx_lock when it needs to call + * netif_wake_queue. * * Both of these locks are to be held with BH safety. * @@ -2118,8 +2118,6 @@ struct tg3 { u32 tx_cons; u32 tx_pending; - spinlock_t tx_lock; - struct tg3_tx_buffer_desc *tx_ring; struct tx_ring_info *tx_buffers; dma_addr_t tx_desc_mapping; From 70f8e78e150425b01c1099087ad3decacf7e4ccf Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 8 Aug 2006 16:47:37 -0700 Subject: [PATCH 10/12] [RTNETLINK]: Fix IFLA_ADDRESS handling. The ->set_mac_address handlers expect a pointer to a sockaddr which contains the MAC address, whereas IFLA_ADDRESS provides just the MAC address itself. So whip up a sockaddr to wrap around the netlink attribute for the ->set_mac_address call. Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 20e5bb73f147..30cc1ba6ed5c 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -394,6 +394,9 @@ static int do_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } if (ida[IFLA_ADDRESS - 1]) { + struct sockaddr *sa; + int len; + if (!dev->set_mac_address) { err = -EOPNOTSUPP; goto out; @@ -405,7 +408,17 @@ static int do_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) if (ida[IFLA_ADDRESS - 1]->rta_len != RTA_LENGTH(dev->addr_len)) goto out; - err = dev->set_mac_address(dev, RTA_DATA(ida[IFLA_ADDRESS - 1])); + len = sizeof(sa_family_t) + dev->addr_len; + sa = kmalloc(len, GFP_KERNEL); + if (!sa) { + err = -ENOMEM; + goto out; + } + sa->sa_family = dev->type; + memcpy(sa->sa_data, RTA_DATA(ida[IFLA_ADDRESS - 1]), + dev->addr_len); + err = dev->set_mac_address(dev, sa); + kfree(sa); if (err) goto out; send_addr_notify = 1; From 7b1ba8de569460894efa892457af7a37c0d574f9 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 8 Aug 2006 16:48:51 -0700 Subject: [PATCH 11/12] [IPX]: Another nonlinear receive fix Need to check some more cases in IPX receive. If the skb is purely fragments, the IPX header needs to be extracted. The function pskb_may_pull() may in theory invalidate all the pointers in the skb, so references to ipx header must be refreshed. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipx/af_ipx.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index c13e86b14f69..401964204866 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1642,14 +1642,17 @@ static int ipx_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_ty if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) goto out; - ipx = ipx_hdr(skb); - ipx_pktsize = ntohs(ipx->ipx_pktsize); + if (!pskb_may_pull(skb, sizeof(struct ipxhdr))) + goto drop; + + ipx_pktsize = ntohs(ipxhdr(skb)->ipx_pktsize); /* Too small or invalid header? */ if (ipx_pktsize < sizeof(struct ipxhdr) || !pskb_may_pull(skb, ipx_pktsize)) goto drop; + ipx = ipx_hdr(skb); if (ipx->ipx_checksum != IPX_NO_CHECKSUM && ipx->ipx_checksum != ipx_cksum(ipx, ipx_pktsize)) goto drop; From 7c91767a6b701543c93ebcd611dab61deff3dad1 Mon Sep 17 00:00:00 2001 From: Dmitry Mishin Date: Wed, 9 Aug 2006 02:25:54 -0700 Subject: [PATCH 12/12] [NET]: add_timer -> mod_timer() in dst_run_gc() Patch from Dmitry Mishin : Replace add_timer() by mod_timer() in dst_run_gc in order to avoid BUG message. CPU1 CPU2 dst_run_gc() entered dst_run_gc() entered spin_lock(&dst_lock) ..... del_timer(&dst_gc_timer) fail to get lock .... mod_timer() <--- puts timer back to the list add_timer(&dst_gc_timer) <--- BUG because timer is in list already. Found during OpenVZ internal testing. At first we thought that it is OpenVZ specific as we added dst_run_gc(0) call in dst_dev_event(), but as Alexey pointed to me it is possible to trigger this condition in mainstream kernel. F.e. timer has fired on CPU2, but the handler was preeempted by an irq before dst_lock is tried. Meanwhile, someone on CPU1 adds an entry to gc list and starts the timer. If CPU2 was preempted long enough, this timer can expire simultaneously with resuming timer handler on CPU1, arriving exactly to the situation described. Signed-off-by: Dmitry Mishin Signed-off-by: Kirill Korotaev Signed-off-by: Alexey Kuznetsov Signed-off-by: David S. Miller --- net/core/dst.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/core/dst.c b/net/core/dst.c index 470c05bc4cb2..1a5e49da0e77 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -95,12 +95,11 @@ static void dst_run_gc(unsigned long dummy) dst_gc_timer_inc = DST_GC_INC; dst_gc_timer_expires = DST_GC_MIN; } - dst_gc_timer.expires = jiffies + dst_gc_timer_expires; #if RT_CACHE_DEBUG >= 2 printk("dst_total: %d/%d %ld\n", atomic_read(&dst_total), delayed, dst_gc_timer_expires); #endif - add_timer(&dst_gc_timer); + mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires); out: spin_unlock(&dst_lock);