From 3098ac39630c8908fb13cd3137a4629c04df9fe3 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 21 May 2015 12:40:14 +0900 Subject: [PATCH 1/4] rocker: do not delete fdb entries in rocker_port_fdb_flush() when preparing transactions rocker_port_fdb_flush() is called by rocker_port_stp_update() which in turn may be called with trans == SWITCHDEV_TRANS_PREPARE and then trans == SWITCHDEV_TRANS_COMMIT from switchdev_port_attr_set() via br_set_state(). When rocker_port_fdb_flush() is called with trans == SWITCHDEV_TRANS_PREPARE it calls rocker_port_fdb_learn() for each entry in the FDB table which in turn calls rocker_flow_tbl_bridge() which will allocate memory using rocker_port_kzalloc(). rocker_port_fdb_learn() will then remove the entry from the FDB table. Then when rocker_port_fdb_learn() is called with trans == SWITCHDEV_TRANS_PREPARE no calls are made to rocker_port_fdb_learn() because there are no longer any entries present in the FDB table. Thus the memory previously allocated by rocker_port_fdb_learn() is leaked resulting in the kernel BUG() below. Furthermore, it looks like the driver ends up with an incorrect view of the fdb table as the FDB entries are purged from the driver's table but not the hardware's table. ip link add br0 type bridge ip link set up dev eth0 sleep 1 ip link set dev eth0 master br0 [ 3.704360] ------------[ cut here ]------------ [ 3.704611] kernel BUG at drivers/net/ethernet/rocker/rocker.c:4289! [ 3.704962] invalid opcode: 0000 [#1] SMP [ 3.705537] Modules linked in: [ 3.705919] CPU: 0 PID: 63 Comm: ip Not tainted 4.1.0-rc3-01046-gb9fbe709de4d #1044 [ 3.706191] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.0-0-g4c59f5d-20150219_092859-nilsson.home.kraxel.org 04/01/2014 [ 3.706820] task: ffff880019f70150 ti: ffff88001f92c000 task.ti: ffff88001f92c000 [ 3.707138] RIP: 0010:[] [] rocker_port_attr_set+0xe0/0xf0 [ 3.707990] RSP: 0018:ffff88001f92f808 EFLAGS: 00000212 [ 3.708200] RAX: ffff880019d4fa68 RBX: ffff880019d4f000 RCX: 0000000000000000 [ 3.708471] RDX: 000000000000000c RSI: ffff88001f92f890 RDI: ffff880019d4f680 [ 3.708740] RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000004 [ 3.708999] R10: ffff880000034024 R11: 0000000000000000 R12: ffff88001f92f890 [ 3.709276] R13: ffff88001f8f1c00 R14: 000000000000000b R15: 0000000000000000 [ 3.709303] FS: 00007f8ab66bd700(0000) GS:ffff88001b000000(0000) knlGS:0000000000000000 [ 3.709303] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3.709303] CR2: 0000000000654988 CR3: 000000001f8f3000 CR4: 00000000000006b0 [ 3.709303] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 3.709303] DR3: 0000000000000000 DR6: 0000000000000000 DR7: 0000000000000000 [ 3.709303] Stack: [ 3.709303] ffff88001f8f1c00 000000000000000b ffff88001f92f890 ffff880019d4f000 [ 3.709303] ffff88001f92f890 ffffffff813332f5 ffff88001f92f880 0000000000000000 [ 3.709303] ffff88001f92f890 0000000000000001 ffff880019d4f000 ffffffff81333627 [ 3.709303] Call Trace: [ 3.709303] [] ? __switchdev_port_attr_set+0x25/0x90 [ 3.709303] [] ? switchdev_port_attr_set+0x27/0x120 [ 3.709303] [] ? br_set_state+0x36/0x50 [ 3.709303] [] ? br_add_if+0x37c/0x400 [ 3.709303] [] ? do_setlink+0x7e1/0x800 [ 3.709303] [] ? radix_tree_lookup_slot+0x10/0x30 [ 3.709303] [] ? nla_parse+0xaa/0x110 [ 3.709303] [] ? rtnl_newlink+0x548/0x870 [ 3.709303] [] ? __radix_tree_lookup+0x40/0xb0 [ 3.709303] [] ? nla_parse+0x2e/0x110 [ 3.709303] [] ? rtnetlink_rcv_msg+0x7e/0x250 [ 3.709303] [] ? __skb_recv_datagram+0xfe/0x4b0 [ 3.709303] [] ? rtnetlink_rcv+0x30/0x30 [ 3.709303] [] ? netlink_rcv_skb+0xa8/0xd0 [ 3.709303] [] ? rtnetlink_rcv+0x1f/0x30 [ 3.709303] [] ? netlink_unicast+0x150/0x200 [ 3.709303] [] ? netlink_sendmsg+0x374/0x3e0 [ 3.709303] [] ? sock_sendmsg+0xf/0x30 [ 3.709303] [] ? ___sys_sendmsg+0x1f3/0x200 [ 3.709303] [] ? ___sys_recvmsg+0x105/0x140 [ 3.709303] [] ? dev_get_by_name_rcu+0x69/0x90 [ 3.709303] [] ? dev_get_by_name_rcu+0x69/0x90 [ 3.709303] [] ? skb_dequeue+0x4d/0x60 [ 3.709303] [] ? skb_queue_purge+0x20/0x30 [ 3.709303] [] ? __inode_wait_for_writeback+0x5f/0xb0 [ 3.709303] [] ? autoremove_wake_function+0x30/0x30 [ 3.709303] [] ? __sys_sendmsg+0x39/0x70 [ 3.709303] [] ? system_call_fastpath+0x12/0x6a [ 3.709303] Code: bb 90 06 00 00 48 c7 04 24 00 00 00 00 45 31 c9 45 31 c0 48 c7 c1 c0 b7 1e 81 89 ea e8 da da ff ff eb 95 0f 1f 84 00 00 00 00 00 <0f> 0b 66 66 66 66 66 2e 0f 1f 84 00 00 00 00 00 48 83 fe 15 75 [ 3.709303] RIP [] rocker_port_attr_set+0xe0/0xf0 [ 3.709303] RSP [ 3.721409] ---[ end trace b7481fcb7cb032aa ]--- Segmentation fault Fixes: c4f20321d968 ("rocker: support prepare-commit transaction model") Acked-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index 0f5e962d691c..ce1bfab077a7 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -3658,7 +3658,8 @@ static int rocker_port_fdb_flush(struct rocker_port *rocker_port, found->key.vlan_id); if (err) goto err_out; - hash_del(&found->entry); + if (trans != SWITCHDEV_TRANS_PREPARE) + hash_del(&found->entry); } err_out: From 42e9488971f26c07719e99e33d14811488d31632 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 21 May 2015 12:40:15 +0900 Subject: [PATCH 2/4] rocker: do not modify fdb table in rocker_port_fdb() when preparing transactions rocker_port_fdb_flush() may be called be called with trans == SWITCHDEV_TRANS_PREPARE and then trans == SWITCHDEV_TRANS_COMMIT from switchdev_port_attr_set() via switchdev_port_obj_add(). Adding the new entry to the FDB table when trans == SWITCHDEV_TRANS_PREPARE may result in a memory leak because when trans == SWITCHDEV_TRANS_PREPARE rocker_flow_tbl_bridge() will allocate memory when called via rocker_port_fdb_learn(). However, when trans == SWITCHDEV_TRANS_COMMIT the presence of the FDB entry in the FDB table causes rocker_port_fdb() to set the ROCKER_OP_FLAG_REFRESH flag which results in rocker_port_fdb_learn() skipping the call to rocker_flow_tbl_bridge() which would free the memory allocated by it when trans == SWITCHDEV_TRANS_PREPARE. ip link add br0 type bridge ip link set up dev eth0 ip link set dev eth0 master br0 bridge fdb add 52:54:00:12:35:08 dev eth0 bridge fdb add 52:54:00:12:35:09 dev eth0 [ 2.600730] ------------[ cut here ]------------ [ 2.601002] kernel BUG at drivers/net/ethernet/rocker/rocker.c:4369! [ 2.601373] invalid opcode: 0000 [#1] SMP [ 2.601963] Modules linked in: [ 2.602355] CPU: 0 PID: 64 Comm: bridge Not tainted 4.1.0-rc3-01048-g6d0f50c50211-dirty #1075 [ 2.602721] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.8.0-0-g4c59f5d-20150219_092859-nilsson.home.kraxel.org 04/01/2014 [ 2.602721] task: ffff880019facef0 ti: ffff88001f96c000 task.ti: ffff88001f96c000 [ 2.602721] RIP: 0010:[] [] rocker_port_obj_add+0x150/0x160 [ 2.602721] RSP: 0018:ffff88001f96fa98 EFLAGS: 00000212 [ 2.602721] RAX: ffff880019d4fa68 RBX: ffff88001f96fb18 RCX: 0000000000000000 [ 2.602721] RDX: ffff880019d4f000 RSI: ffff88001f96fb18 RDI: ffff880019d4f000 [ 2.602721] RBP: 0000000000000001 R08: 0000000000000000 R09: ffff88001f904620 [ 2.602721] R10: ffff88001f96fb60 R11: ffff880019e9d100 R12: ffff88001f96fb18 [ 2.602721] R13: ffff880019d4f680 R14: ffff88001f904610 R15: ffff8800198f7b80 [ 2.602721] FS: 00007f3eee917700(0000) GS:ffff88001b000000(0000) knlGS:0000000000000000 [ 2.602721] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 2.602721] CR2: 00007f3eee4a15cb CR3: 000000001f933000 CR4: 00000000000006b0 [ 2.602721] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 2.602721] DR3: 0000000000000000 DR6: 0000000000000000 DR7: 0000000000000000 [ 2.602721] Stack: [ 2.602721] 0000000000000000 ffff88001f96fb18 ffff880019d4f000 ffff88001f96fb18 [ 2.602721] ffff880019d4f000 ffffffff81332105 ffff88001f96fb50 ffffffff814464c0 [ 2.602721] ffff88001f96fb18 ffff88001f904600 ffff880019d4f000 ffffffff813326e5 [ 2.602721] Call Trace: [ 2.602721] [] ? __switchdev_port_obj_add+0x25/0x90 [ 2.602721] [] ? switchdev_port_obj_add+0x25/0xc0 [ 2.602721] [] ? switchdev_port_fdb_add+0x31/0x40 [ 2.602721] [] ? rtnl_fdb_add+0xff/0x1e0 [ 2.602721] [] ? rtnetlink_rcv_msg+0x7e/0x250 [ 2.602721] [] ? __skb_recv_datagram+0xfe/0x4b0 [ 2.602721] [] ? rtnetlink_rcv+0x30/0x30 [ 2.602721] [] ? netlink_rcv_skb+0xa8/0xd0 [ 2.602721] [] ? rtnetlink_rcv+0x1f/0x30 [ 2.602721] [] ? netlink_unicast+0x150/0x200 [ 2.602721] [] ? netlink_sendmsg+0x374/0x3e0 [ 2.602721] [] ? sock_sendmsg+0xf/0x30 [ 2.602721] [] ? ___sys_sendmsg+0x1f3/0x200 [ 2.602721] [] ? ___sys_recvmsg+0x105/0x140 [ 2.602721] [] ? SyS_readahead+0x90/0x90 [ 2.602721] [] ? filemap_map_pages+0x1ed/0x210 [ 2.602721] [] ? handle_mm_fault+0x5fc/0xe50 [ 2.602721] [] ? __sys_sendmsg+0x39/0x70 [ 2.602721] [] ? system_call_fastpath+0x12/0x6a [ 2.602721] Code: b7 8f a0 06 00 00 48 83 bf 88 06 00 00 00 74 1d 48 83 c4 08 89 ee 4c 89 ef 5b 5d 41 5c 41 5d 0f b7 c9 45 31 c0 e9 51 db ff ff 90 <0f> 0b b8 ea ff ff ff e9 cf fe ff ff 0f 1f 40 00 41 57 41 56 b9 [ 2.602721] RIP [] rocker_port_obj_add+0x150/0x160 [ 2.602721] RSP [ 2.615848] ---[ end trace 4f7b4f1c98077108 ]--- The above is resolved by not adding the new FDB entry to the FDB table if trans == SWITCHDEV_TRANS_PREPARE. For symmetry this patch also skips deleting FDB entries from the FDB table trans == SWITCHDEV_TRANS_PREPARE. However, my analysis is that this never occurs as trans is always SWITCHDEV_TRANS_NONE when removing FDB entries. Fixes: c4f20321d968 ("rocker: support prepare-commit transaction model") Acked-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index ce1bfab077a7..89d22bdcbdc4 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -3612,9 +3612,11 @@ static int rocker_port_fdb(struct rocker_port *rocker_port, if (removing && found) { rocker_port_kfree(rocker_port, trans, fdb); - hash_del(&found->entry); + if (trans != SWITCHDEV_TRANS_PREPARE) + hash_del(&found->entry); } else if (!removing && !found) { - hash_add(rocker->fdb_tbl, &fdb->entry, fdb->key_crc32); + if (trans != SWITCHDEV_TRANS_PREPARE) + hash_add(rocker->fdb_tbl, &fdb->entry, fdb->key_crc32); } spin_unlock_irqrestore(&rocker->fdb_tbl_lock, lock_flags); From 550ecc92feb10fdb7ab1d052707b11bca955de5f Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 21 May 2015 12:40:16 +0900 Subject: [PATCH 3/4] rocker: do not make neighbour entry changes when preparing transactions rocker_port_ipv4_nh() and in turn rocker_port_ipv4_neigh() may be be called with trans == SWITCHDEV_TRANS_PREPARE and then trans == SWITCHDEV_TRANS_COMMIT from switchdev_port_obj_set() via fib_table_insert(). The first time that rocker_port_ipv4_nh() is called, with trans == SWITCHDEV_TRANS_PREPARE, _rocker_neigh_add() adds a new entry to the neigh table. And the second time rocker_port_ipv4_nh() is called, with trans == SWITCHDEV_TRANS_COMMIT, that entry is found. This causes rocker_port_ipv4_nh() to believe it is not adding an entry and thus it frees "entry", which is still present in rocker driver's neigh table. This problem does not appear to affect deletion as my analysis is that deletion is always performed with trans == SWITCHDEV_TRANS_NONE. For completeness _rocker_neigh_{add,del,prepare} are updated not to manipulate fib table entries if trans == SWITCHDEV_TRANS_PREPARE. Fixes: c4f20321d968 ("rocker: support prepare-commit transaction model") Reported-by: Toshiaki Makita Acked-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index 89d22bdcbdc4..51744764acd7 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -2909,9 +2909,13 @@ static struct rocker_neigh_tbl_entry * } static void _rocker_neigh_add(struct rocker *rocker, + enum switchdev_trans trans, struct rocker_neigh_tbl_entry *entry) { - entry->index = rocker->neigh_tbl_next_index++; + entry->index = rocker->neigh_tbl_next_index; + if (trans == SWITCHDEV_TRANS_PREPARE) + return; + rocker->neigh_tbl_next_index++; entry->ref_count++; hash_add(rocker->neigh_tbl, &entry->entry, be32_to_cpu(entry->ip_addr)); @@ -2921,6 +2925,8 @@ static void _rocker_neigh_del(struct rocker_port *rocker_port, enum switchdev_trans trans, struct rocker_neigh_tbl_entry *entry) { + if (trans == SWITCHDEV_TRANS_PREPARE) + return; if (--entry->ref_count == 0) { hash_del(&entry->entry); rocker_port_kfree(rocker_port, trans, entry); @@ -2928,12 +2934,13 @@ static void _rocker_neigh_del(struct rocker_port *rocker_port, } static void _rocker_neigh_update(struct rocker_neigh_tbl_entry *entry, + enum switchdev_trans trans, u8 *eth_dst, bool ttl_check) { if (eth_dst) { ether_addr_copy(entry->eth_dst, eth_dst); entry->ttl_check = ttl_check; - } else { + } else if (trans != SWITCHDEV_TRANS_PREPARE) { entry->ref_count++; } } @@ -2973,12 +2980,12 @@ static int rocker_port_ipv4_neigh(struct rocker_port *rocker_port, entry->dev = rocker_port->dev; ether_addr_copy(entry->eth_dst, eth_dst); entry->ttl_check = true; - _rocker_neigh_add(rocker, entry); + _rocker_neigh_add(rocker, trans, entry); } else if (removing) { memcpy(entry, found, sizeof(*entry)); _rocker_neigh_del(rocker_port, trans, found); } else if (updating) { - _rocker_neigh_update(found, eth_dst, true); + _rocker_neigh_update(found, trans, eth_dst, true); memcpy(entry, found, sizeof(*entry)); } else { err = -ENOENT; @@ -3089,13 +3096,13 @@ static int rocker_port_ipv4_nh(struct rocker_port *rocker_port, if (adding) { entry->ip_addr = ip_addr; entry->dev = rocker_port->dev; - _rocker_neigh_add(rocker, entry); + _rocker_neigh_add(rocker, trans, entry); *index = entry->index; resolved = false; } else if (removing) { _rocker_neigh_del(rocker_port, trans, found); } else if (updating) { - _rocker_neigh_update(found, NULL, false); + _rocker_neigh_update(found, trans, NULL, false); resolved = !is_zero_ether_addr(found->eth_dst); } else { err = -ENOENT; From df6a20673011e89f7fbe3d667eee0a9550679841 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 21 May 2015 12:40:17 +0900 Subject: [PATCH 4/4] rocker: make rocker_port_internal_vlan_id_{get, put}() non-transactional The motivation for this is that rocker_port_internal_vlan_id_{get,put} appear to only partially implement the transaction model: memory allocation and freeing is transactional, but hash and bitmap manipulation is not. The latter could be fixed, however, as it is not currently exercised due to trans always being SWITCHDEV_TRANS_NONE it seems cleaner to make rocker_port_internal_vlan_id_get non-transactional. This problem was introduced by c4f20321d968 ("rocker: support prepare-commit transaction model"). Found by inspection. I do not believe that this change should have any run-time effect. Acked-by: Scott Feldman Acked-by: Jiri Pirko Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/ethernet/rocker/rocker.c | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c index 51744764acd7..701ffc200917 100644 --- a/drivers/net/ethernet/rocker/rocker.c +++ b/drivers/net/ethernet/rocker/rocker.c @@ -3853,7 +3853,6 @@ rocker_internal_vlan_tbl_find(struct rocker *rocker, int ifindex) } static __be16 rocker_port_internal_vlan_id_get(struct rocker_port *rocker_port, - enum switchdev_trans trans, int ifindex) { struct rocker *rocker = rocker_port->rocker; @@ -3862,7 +3861,7 @@ static __be16 rocker_port_internal_vlan_id_get(struct rocker_port *rocker_port, unsigned long lock_flags; int i; - entry = rocker_port_kzalloc(rocker_port, trans, sizeof(*entry)); + entry = kzalloc(sizeof(*entry), GFP_KERNEL); if (!entry) return 0; @@ -3872,7 +3871,7 @@ static __be16 rocker_port_internal_vlan_id_get(struct rocker_port *rocker_port, found = rocker_internal_vlan_tbl_find(rocker, ifindex); if (found) { - rocker_port_kfree(rocker_port, trans, entry); + kfree(entry); goto found; } @@ -3896,7 +3895,6 @@ found: } static void rocker_port_internal_vlan_id_put(struct rocker_port *rocker_port, - enum switchdev_trans trans, int ifindex) { struct rocker *rocker = rocker_port->rocker; @@ -3918,7 +3916,7 @@ static void rocker_port_internal_vlan_id_put(struct rocker_port *rocker_port, bit = ntohs(found->vlan_id) - ROCKER_INTERNAL_VLAN_ID_BASE; clear_bit(bit, rocker->internal_vlan_bitmap); hash_del(&found->entry); - rocker_port_kfree(rocker_port, trans, found); + kfree(found); } not_found: @@ -4904,9 +4902,7 @@ static int rocker_probe_port(struct rocker *rocker, unsigned int port_number) rocker_port_set_learning(rocker_port, SWITCHDEV_TRANS_NONE); rocker_port->internal_vlan_id = - rocker_port_internal_vlan_id_get(rocker_port, - SWITCHDEV_TRANS_NONE, - dev->ifindex); + rocker_port_internal_vlan_id_get(rocker_port, dev->ifindex); err = rocker_port_ig_tbl(rocker_port, SWITCHDEV_TRANS_NONE, 0); if (err) { dev_err(&pdev->dev, "install ig port table failed\n"); @@ -5154,7 +5150,7 @@ static int rocker_port_bridge_join(struct rocker_port *rocker_port, { int err; - rocker_port_internal_vlan_id_put(rocker_port, SWITCHDEV_TRANS_NONE, + rocker_port_internal_vlan_id_put(rocker_port, rocker_port->dev->ifindex); rocker_port->bridge_dev = bridge; @@ -5165,9 +5161,7 @@ static int rocker_port_bridge_join(struct rocker_port *rocker_port, if (err) return err; rocker_port->internal_vlan_id = - rocker_port_internal_vlan_id_get(rocker_port, - SWITCHDEV_TRANS_NONE, - bridge->ifindex); + rocker_port_internal_vlan_id_get(rocker_port, bridge->ifindex); return rocker_port_vlan(rocker_port, SWITCHDEV_TRANS_NONE, 0, 0); } @@ -5175,7 +5169,7 @@ static int rocker_port_bridge_leave(struct rocker_port *rocker_port) { int err; - rocker_port_internal_vlan_id_put(rocker_port, SWITCHDEV_TRANS_NONE, + rocker_port_internal_vlan_id_put(rocker_port, rocker_port->bridge_dev->ifindex); rocker_port->bridge_dev = NULL; @@ -5187,7 +5181,6 @@ static int rocker_port_bridge_leave(struct rocker_port *rocker_port) return err; rocker_port->internal_vlan_id = rocker_port_internal_vlan_id_get(rocker_port, - SWITCHDEV_TRANS_NONE, rocker_port->dev->ifindex); err = rocker_port_vlan(rocker_port, SWITCHDEV_TRANS_NONE, 0, 0); if (err)