From e0cbb79805083b8862182341ebf72266d58f6d12 Mon Sep 17 00:00:00 2001
From: Junxiao Bi
Date: Fri, 27 May 2016 14:26:58 -0700
Subject: [PATCH 01/19] ocfs2: o2hb: add negotiate timer

This series of patches fixes the issue that, when storage goes down, all
nodes fence themselves due to write timeout. With this patch set, all
nodes will keep going until storage comes back online, except if one of
the following issues happens, in which case nodes fence themselves as
before:

1. an io error is returned
2. the network between nodes goes down
3. a node panics

This patch (of 6):

When storage goes down, all nodes fence themselves due to write timeout.
The negotiate timer is designed to avoid this; with it, a node will wait
until storage comes back up. The negotiate timer works in the following
way:

1. The timer expires before the write timeout timer; its timeout is half
   of the write timeout. It is re-queued along with the write timeout
   timer. If it expires, it sends a NEGO_TIMEOUT message to the master
   node (the node with the lowest node number). This message does nothing
   but mark a bit in a bitmap on the master node recording which nodes
   are negotiating timeout.

2. If storage is down, all nodes will send this message to the master
   node. When the master node finds its bitmap covers all online nodes,
   it sends a NEGO_APPROVE message to all nodes one by one; this message
   re-queues the write timeout timer and the negotiate timer. Any node
   that doesn't receive this message, or hits an error while handling it,
   will be fenced.

If storage comes back up at any time, o2hb_thread will run and re-queue
all the timers; nothing is affected by these two steps.

Signed-off-by: Junxiao Bi
Reviewed-by: Ryan Ding
Reviewed-by: Mark Fasheh
Cc: Gang He
Cc: rwxybh
Cc: Joel Becker
Cc: Joseph Qi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/cluster/heartbeat.c | 51 +++++++++++++++++++++++++++++++++---
 1 file changed, 47 insertions(+), 4 deletions(-)

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index a8d15beee5cb..750c950f4e1f 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -272,6 +272,10 @@ struct o2hb_region {
 	struct delayed_work	hr_write_timeout_work;
 	unsigned long		hr_last_timeout_start;
 
+	/* negotiate timer, used to negotiate extending hb timeout. */
+	struct delayed_work	hr_nego_timeout_work;
+	unsigned long		hr_nego_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+
 	/* Used during o2hb_check_slot to hold a copy of the block
 	 * being checked because we temporarily have to zero out the
 	 * crc field. */
@@ -319,7 +323,7 @@ static void o2hb_write_timeout(struct work_struct *work)
 	o2quo_disk_timeout();
 }
 
-static void o2hb_arm_write_timeout(struct o2hb_region *reg)
+static void o2hb_arm_timeout(struct o2hb_region *reg)
 {
 	/* Arm writeout only after thread reaches steady state */
 	if (atomic_read(&reg->hr_steady_iterations) != 0)
@@ -337,11 +341,49 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
 	reg->hr_last_timeout_start = jiffies;
 	schedule_delayed_work(&reg->hr_write_timeout_work,
 			      msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
+
+	cancel_delayed_work(&reg->hr_nego_timeout_work);
+	/* negotiate timeout must be less than write timeout. */
+	schedule_delayed_work(&reg->hr_nego_timeout_work,
+			      msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS)/2);
+	memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap));
 }
 
-static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
+static void o2hb_disarm_timeout(struct o2hb_region *reg)
 {
 	cancel_delayed_work_sync(&reg->hr_write_timeout_work);
+	cancel_delayed_work_sync(&reg->hr_nego_timeout_work);
+}
+
+static void o2hb_nego_timeout(struct work_struct *work)
+{
+	unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
+	int master_node;
+	struct o2hb_region *reg;
+
+	reg = container_of(work, struct o2hb_region, hr_nego_timeout_work.work);
+	o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
+	/* lowest node as master node to make negotiate decision. */
+	master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0);
+
+	if (master_node == o2nm_this_node()) {
+		set_bit(master_node, reg->hr_nego_node_bitmap);
+		if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap,
+				sizeof(reg->hr_nego_node_bitmap))) {
+			/* check negotiate bitmap every second to do timeout
+			 * approve decision.
+			 */
+			schedule_delayed_work(&reg->hr_nego_timeout_work,
+				msecs_to_jiffies(1000));
+
+			return;
+		}
+
+		/* approve negotiate timeout request. */
+	} else {
+		/* negotiate timeout with master node. */
+	}
+
 }
 
 static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -1032,7 +1074,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 	/* Skip disarming the timeout if own slot has stale/bad data */
 	if (own_slot_ok) {
 		o2hb_set_quorum_device(reg);
-		o2hb_arm_write_timeout(reg);
+		o2hb_arm_timeout(reg);
 	}
 
 bail:
@@ -1114,7 +1156,7 @@ static int o2hb_thread(void *data)
 		}
 	}
 
-	o2hb_disarm_write_timeout(reg);
+	o2hb_disarm_timeout(reg);
 
 	/* unclean stop is only used in very bad situation */
 	for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
@@ -1762,6 +1804,7 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
 	}
 
 	INIT_DELAYED_WORK(&reg->hr_write_timeout_work, o2hb_write_timeout);
+	INIT_DELAYED_WORK(&reg->hr_nego_timeout_work, o2hb_nego_timeout);
 
 	/*
 	 * A node is considered live after it has beat LIVE_THRESHOLD

From 34069b886f95356d68bf8315fa648c4ab3193cdd Mon Sep 17 00:00:00 2001
From: Junxiao Bi
Date: Fri, 27 May 2016 14:27:01 -0700
Subject: [PATCH 02/19] ocfs2: o2hb: add NEGO_TIMEOUT message

This message is sent to the master node when a non-master node's
negotiate timer expires. The master node records these nodes in a bitmap,
which is used to make the write timeout timer re-queue decision.

Signed-off-by: Junxiao Bi
Reviewed-by: Ryan Ding
Reviewed-by: Mark Fasheh
Cc: Gang He
Cc: rwxybh
Cc: Joel Becker
Cc: Joseph Qi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/cluster/heartbeat.c | 66 +++++++++++++++++++++++++++++++++++-
 1 file changed, 65 insertions(+), 1 deletion(-)

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 750c950f4e1f..454c89076833 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -280,6 +280,10 @@ struct o2hb_region {
 	 * being checked because we temporarily have to zero out the
 	 * crc field. */
 	struct o2hb_disk_heartbeat_block *hr_tmp_block;
+
+	/* Message key for negotiate timeout message. */
+	unsigned int hr_key;
+	struct list_head hr_handler_list;
 };
 
 struct o2hb_bio_wait_ctxt {
@@ -288,6 +292,14 @@ struct o2hb_bio_wait_ctxt {
 	int               wc_error;
 };
 
+enum {
+	O2HB_NEGO_TIMEOUT_MSG = 1,
+};
+
+struct o2hb_nego_msg {
+	u8 node_num;
+};
+
 static void o2hb_write_timeout(struct work_struct *work)
 {
 	int failed, quorum;
@@ -355,6 +367,24 @@ static void o2hb_disarm_timeout(struct o2hb_region *reg)
 	cancel_delayed_work_sync(&reg->hr_nego_timeout_work);
 }
 
+static int o2hb_send_nego_msg(int key, int type, u8 target)
+{
+	struct o2hb_nego_msg msg;
+	int status, ret;
+
+	msg.node_num = o2nm_this_node();
+again:
+	ret = o2net_send_message(type, key, &msg, sizeof(msg),
+			target, &status);
+
+	if (ret == -EAGAIN || ret == -ENOMEM) {
+		msleep(100);
+		goto again;
+	}
+
+	return ret;
+}
+
 static void o2hb_nego_timeout(struct work_struct *work)
 {
 	unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -382,8 +412,24 @@
 		/* approve negotiate timeout request. */
 	} else {
 		/* negotiate timeout with master node. */
+		o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG,
+			master_node);
 	}
+}
 
+static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data,
+				void **ret_data)
+{
+	struct o2hb_region *reg = data;
+	struct o2hb_nego_msg *nego_msg;
+
+	nego_msg = (struct o2hb_nego_msg *)msg->buf;
+	if (nego_msg->node_num < O2NM_MAX_NODES)
+		set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap);
+	else
+		mlog(ML_ERROR, "got nego timeout message from bad node.\n");
+
+	return 0;
 }
 
 static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
@@ -1493,6 +1539,7 @@ static void o2hb_region_release(struct config_item *item)
 	list_del(&reg->hr_all_item);
 	spin_unlock(&o2hb_live_lock);
 
+	o2net_unregister_handler_list(&reg->hr_handler_list);
 	kfree(reg);
 }
 
@@ -2038,13 +2085,30 @@ static struct config_item *o2hb_heartbeat_group_make_item(struct config_group *g
 
 	config_item_init_type_name(&reg->hr_item, name, &o2hb_region_type);
 
+	/* this is the same way to generate msg key as dlm, for local heartbeat,
+	 * name is also the same, so make initial crc value different to avoid
+	 * message key conflict.
+	 */
+	reg->hr_key = crc32_le(reg->hr_region_num + O2NM_MAX_REGIONS,
+		name, strlen(name));
+	INIT_LIST_HEAD(&reg->hr_handler_list);
+	ret = o2net_register_handler(O2HB_NEGO_TIMEOUT_MSG, reg->hr_key,
+			sizeof(struct o2hb_nego_msg),
+			o2hb_nego_timeout_handler,
+			reg, NULL, &reg->hr_handler_list);
+	if (ret)
+		goto free;
+
 	ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
 	if (ret) {
 		config_item_put(&reg->hr_item);
-		goto free;
+		goto unregister_handler;
 	}
 
 	return &reg->hr_item;
+
+unregister_handler:
+	o2net_unregister_handler_list(&reg->hr_handler_list);
 free:
 	kfree(reg);
 	return ERR_PTR(ret);

From e76f8237a2f7b7220980c0fb3c6d0b1d48ba79ad Mon Sep 17 00:00:00 2001
From: Junxiao Bi
Date: Fri, 27 May 2016 14:27:04 -0700
Subject: [PATCH 03/19] ocfs2: o2hb: add NEGOTIATE_APPROVE message

This message is used to re-queue the write timeout timer and the
negotiate timer when all nodes suffer a write hang to storage; it keeps
nodes from fencing themselves if storage goes down.
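To illustrate, the complete negotiation added by this series runs roughly
as follows (a sketch of the flow, not verbatim code; T stands for
O2HB_MAX_WRITE_TIMEOUT_MS, and the function names are the handlers these
patches add):

	non-master nodes                     master (lowest live node number)
	----------------                     --------------------------------
	hb write hangs                       hb write hangs
	nego timer fires at T/2              nego timer fires at T/2
	o2hb_nego_timeout():                 o2hb_nego_timeout():
	  send NEGO_TIMEOUT  --------------->  set_bit() in hr_nego_node_bitmap
	                                       bitmap != live node map?
	                                         -> re-check again in 1s
	                                       bitmap == live node map?
	                                         -> o2hb_arm_timeout() locally
	  o2hb_arm_timeout() <---------------       send NEGO_APPROVE to each node

Any node whose timers are not re-armed through this exchange still hits
the write timeout and fences itself, which preserves the old behavior for
the failure cases listed in patch 1.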
Signed-off-by: Junxiao Bi
Reviewed-by: Ryan Ding
Reviewed-by: Mark Fasheh
Cc: Gang He
Cc: rwxybh
Cc: Joel Becker
Cc: Joseph Qi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/cluster/heartbeat.c | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 454c89076833..e929b15d6162 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -294,6 +294,7 @@ struct o2hb_bio_wait_ctxt {
 
 enum {
 	O2HB_NEGO_TIMEOUT_MSG = 1,
+	O2HB_NEGO_APPROVE_MSG = 2,
 };
 
 struct o2hb_nego_msg {
@@ -388,7 +389,7 @@ again:
 static void o2hb_nego_timeout(struct work_struct *work)
 {
 	unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
-	int master_node;
+	int master_node, i;
 	struct o2hb_region *reg;
 
 	reg = container_of(work, struct o2hb_region, hr_nego_timeout_work.work);
@@ -410,6 +411,17 @@
 		}
 
 		/* approve negotiate timeout request. */
+		o2hb_arm_timeout(reg);
+
+		i = -1;
+		while ((i = find_next_bit(live_node_bitmap,
+				O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
+			if (i == master_node)
+				continue;
+
+			o2hb_send_nego_msg(reg->hr_key,
+				O2HB_NEGO_APPROVE_MSG, i);
+		}
 	} else {
 		/* negotiate timeout with master node. */
 		o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG,
@@ -432,6 +444,13 @@
 	return 0;
 }
 
+static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data,
+				void **ret_data)
+{
+	o2hb_arm_timeout(data);
+	return 0;
+}
+
 static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
 {
 	atomic_set(&wc->wc_num_reqs, 1);
@@ -2099,6 +2118,13 @@
 	if (ret)
 		goto free;
 
+	ret = o2net_register_handler(O2HB_NEGO_APPROVE_MSG, reg->hr_key,
+			sizeof(struct o2hb_nego_msg),
+			o2hb_nego_approve_handler,
+			reg, NULL, &reg->hr_handler_list);
+	if (ret)
+		goto unregister_handler;
+
 	ret = o2hb_debug_region_init(reg, o2hb_debug_dir);
 	if (ret) {
 		config_item_put(&reg->hr_item);

From 1bd1290283d7939478062e80bdd9719d3a21522f Mon Sep 17 00:00:00 2001
From: Junxiao Bi
Date: Fri, 27 May 2016 14:27:07 -0700
Subject: [PATCH 04/19] ocfs2: o2hb: add some user/debug log

Signed-off-by: Junxiao Bi
Reviewed-by: Ryan Ding
Reviewed-by: Mark Fasheh
Cc: Gang He
Cc: rwxybh
Cc: Joel Becker
Cc: Joseph Qi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/cluster/heartbeat.c | 39 +++++++++++++++++++++++++++++-------
 1 file changed, 32 insertions(+), 7 deletions(-)

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index e929b15d6162..84ebeb5678c6 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -292,6 +292,8 @@ struct o2hb_bio_wait_ctxt {
 	int               wc_error;
 };
 
+#define O2HB_NEGO_TIMEOUT_MS (O2HB_MAX_WRITE_TIMEOUT_MS/2)
+
 enum {
 	O2HB_NEGO_TIMEOUT_MSG = 1,
 	O2HB_NEGO_APPROVE_MSG = 2,
@@ -358,7 +360,7 @@ static void o2hb_arm_timeout(struct o2hb_region *reg)
 	cancel_delayed_work(&reg->hr_nego_timeout_work);
 	/* negotiate timeout must be less than write timeout. */
 	schedule_delayed_work(&reg->hr_nego_timeout_work,
-		msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS)/2);
+		msecs_to_jiffies(O2HB_NEGO_TIMEOUT_MS));
 	memset(reg->hr_nego_node_bitmap, 0, sizeof(reg->hr_nego_node_bitmap));
 }
 
@@ -389,7 +391,7 @@ static void o2hb_nego_timeout(struct work_struct *work)
 {
 	unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
-	int master_node, i;
+	int master_node, i, ret;
 	struct o2hb_region *reg;
 
 	reg = container_of(work, struct o2hb_region, hr_nego_timeout_work.work);
@@ -398,7 +400,12 @@ static void o2hb_nego_timeout(struct work_struct *work)
 	master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0);
 
 	if (master_node == o2nm_this_node()) {
-		set_bit(master_node, reg->hr_nego_node_bitmap);
+		if (!test_bit(master_node, reg->hr_nego_node_bitmap)) {
+			printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s).\n",
+				o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000,
+				config_item_name(&reg->hr_item), reg->hr_dev_name);
+			set_bit(master_node, reg->hr_nego_node_bitmap);
+		}
 		if (memcmp(reg->hr_nego_node_bitmap, live_node_bitmap,
 				sizeof(reg->hr_nego_node_bitmap))) {
 			/* check negotiate bitmap every second to do timeout
@@ -410,6 +417,8 @@ static void o2hb_nego_timeout(struct work_struct *work)
 			return;
 		}
 
+		printk(KERN_NOTICE "o2hb: all nodes hb write hung, maybe region %s (%s) is down.\n",
+			config_item_name(&reg->hr_item), reg->hr_dev_name);
 		/* approve negotiate timeout request. */
 		o2hb_arm_timeout(reg);
 
@@ -419,13 +428,23 @@ static void o2hb_nego_timeout(struct work_struct *work)
 			if (i == master_node)
 				continue;
 
-			o2hb_send_nego_msg(reg->hr_key,
+			mlog(ML_HEARTBEAT, "send NEGO_APPROVE msg to node %d\n", i);
+			ret = o2hb_send_nego_msg(reg->hr_key,
 				O2HB_NEGO_APPROVE_MSG, i);
+			if (ret)
+				mlog(ML_ERROR, "send NEGO_APPROVE msg to node %d fail %d\n",
+					i, ret);
 		}
 	} else {
 		/* negotiate timeout with master node. */
-		o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG,
-			master_node);
+		printk(KERN_NOTICE "o2hb: node %d hb write hung for %ds on region %s (%s), negotiate timeout with node %d.\n",
+			o2nm_this_node(), O2HB_NEGO_TIMEOUT_MS/1000, config_item_name(&reg->hr_item),
+			reg->hr_dev_name, master_node);
+		ret = o2hb_send_nego_msg(reg->hr_key, O2HB_NEGO_TIMEOUT_MSG,
+			master_node);
+		if (ret)
+			mlog(ML_ERROR, "send NEGO_TIMEOUT msg to node %d fail %d\n",
+				master_node, ret);
 	}
 }
 
@@ -436,6 +455,8 @@ static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data,
 	struct o2hb_nego_msg *nego_msg;
 
 	nego_msg = (struct o2hb_nego_msg *)msg->buf;
+	printk(KERN_NOTICE "o2hb: receive negotiate timeout message from node %d on region %s (%s).\n",
+		nego_msg->node_num, config_item_name(&reg->hr_item), reg->hr_dev_name);
 	if (nego_msg->node_num < O2NM_MAX_NODES)
 		set_bit(nego_msg->node_num, reg->hr_nego_node_bitmap);
 	else
@@ -447,7 +468,11 @@ static int o2hb_nego_timeout_handler(struct o2net_msg *msg, u32 len, void *data,
 static int o2hb_nego_approve_handler(struct o2net_msg *msg, u32 len, void *data,
 				void **ret_data)
 {
-	o2hb_arm_timeout(data);
+	struct o2hb_region *reg = data;
+
+	printk(KERN_NOTICE "o2hb: negotiate timeout approved by master node on region %s (%s).\n",
+		config_item_name(&reg->hr_item), reg->hr_dev_name);
+	o2hb_arm_timeout(reg);
 	return 0;
 }
 

From 88dbe98dc72fa0c2b778210e2079cd69d2477b36 Mon Sep 17 00:00:00 2001
From: Junxiao Bi
Date: Fri, 27 May 2016 14:27:10 -0700
Subject: [PATCH 05/19] ocfs2: o2hb: don't negotiate if last hb fail

Sometimes an io error is returned when storage has been down for a while.
For an iscsi device, for example, storage is made offline when the
session times out, and this makes all io return -EIO. In this case, nodes
shouldn't negotiate timeout but should fence themselves. So let nodes
fence themselves when o2hb_do_disk_heartbeat() returns an error; this is
the same behavior as o2hb without the negotiate timer.

Signed-off-by: Junxiao Bi
Reviewed-by: Ryan Ding
Reviewed-by: Mark Fasheh
Cc: Gang He
Cc: rwxybh
Cc: Joel Becker
Cc: Joseph Qi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/cluster/heartbeat.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 84ebeb5678c6..31a5c0803e0a 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -284,6 +284,9 @@ struct o2hb_region {
 	/* Message key for negotiate timeout message. */
 	unsigned int hr_key;
 	struct list_head hr_handler_list;
+
+	/* last hb status, 0 for success, other value for error. */
+	int hr_last_hb_status;
 };
 
 struct o2hb_bio_wait_ctxt {
@@ -395,6 +398,12 @@ static void o2hb_nego_timeout(struct work_struct *work)
 	struct o2hb_region *reg;
 
 	reg = container_of(work, struct o2hb_region, hr_nego_timeout_work.work);
+	/* don't negotiate timeout if last hb failed since it is very
+	 * possible io failed. Should let write timeout fence self.
+	 */
+	if (reg->hr_last_hb_status)
+		return;
+
 	o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
 	/* lowest node as master node to make negotiate decision. */
 	master_node = find_next_bit(live_node_bitmap, O2NM_MAX_NODES, 0);
@@ -1228,6 +1237,7 @@ static int o2hb_thread(void *data)
 		before_hb = ktime_get_real();
 
 		ret = o2hb_do_disk_heartbeat(reg);
+		reg->hr_last_hb_status = ret;
 
 		after_hb = ktime_get_real();

From 6633ca573165e000867e50caf94f60a75399b68b Mon Sep 17 00:00:00 2001
From: Junxiao Bi
Date: Fri, 27 May 2016 14:27:13 -0700
Subject: [PATCH 06/19] ocfs2: o2hb: fix hb hung time

hr_last_timeout_start should be set to the last time at which hb was
still OK. When an hb write times out, the hung time will be
(jiffies - hr_last_timeout_start).

Signed-off-by: Junxiao Bi
Reviewed-by: Ryan Ding
Reviewed-by: Mark Fasheh
Cc: Gang He
Cc: rwxybh
Cc: Joel Becker
Cc: Joseph Qi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/cluster/heartbeat.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 31a5c0803e0a..6aaf3e351391 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -356,7 +356,6 @@ static void o2hb_arm_timeout(struct o2hb_region *reg)
 		spin_unlock(&o2hb_live_lock);
 	}
 	cancel_delayed_work(&reg->hr_write_timeout_work);
-	reg->hr_last_timeout_start = jiffies;
 	schedule_delayed_work(&reg->hr_write_timeout_work,
 			      msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
 
@@ -1174,6 +1173,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
 	if (own_slot_ok) {
 		o2hb_set_quorum_device(reg);
 		o2hb_arm_timeout(reg);
+		reg->hr_last_timeout_start = jiffies;
 	}
 
 bail:

From 38b52efd218bf2a11a5b4a8f56052cee6684cfec Mon Sep 17 00:00:00 2001
From: Junxiao Bi
Date: Fri, 27 May 2016 14:27:16 -0700
Subject: [PATCH 07/19] ocfs2: bump up o2cb network protocol version

Two new messages are added to support negotiating hb timeout. Stop nodes
speaking an old protocol version from mounting, as they would cause the
negotiation to fail.
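For context, the enforcement relies on the existing o2net handshake,
which drops connections from peers advertising a different protocol
version. A minimal sketch of that check (simplified from
o2net_check_handshake() in fs/ocfs2/cluster/tcp.c; surrounding logic and
error reporting omitted):

	/* simplified sketch: a peer advertising any other version is
	 * disconnected, so pre-v12 nodes can never join a cluster that
	 * may negotiate hb timeouts they do not understand. */
	if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
		/* log the mismatch and shut the connection down */
		o2net_ensure_shutdown(nn, sc, -ENOTCONN);
		return;
	}

Because the version is compared for exact equality, a single bump is
enough to keep mixed-version clusters from forming at all.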
Link: http://lkml.kernel.org/r/1464231615-27939-1-git-send-email-junxiao.bi@oracle.com
Signed-off-by: Junxiao Bi
Reviewed-by: Mark Fasheh
Cc: Joel Becker
Cc: Joseph Qi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ocfs2/cluster/tcp_internal.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index b95e7df5b76a..94b18369b1cc 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -44,6 +44,9 @@
 * version here in tcp_internal.h should not need to be bumped for
 * filesystem locking changes.
 *
+ * New in version 12
+ *	- Negotiate hb timeout when storage is down.
+ *
 * New in version 11
 *	- Negotiation of filesystem locking in the dlm join.
 *
@@ -75,7 +78,7 @@
 *	- full 64 bit i_size in the metadata lock lvbs
 *	- introduction of "rw" lock and pushing meta/data locking down
 */
-#define O2NET_PROTOCOL_VERSION 11ULL
+#define O2NET_PROTOCOL_VERSION 12ULL
 struct o2net_handshake {
 	__be64	protocol_version;
 	__be64	connector_id;

From 9ecd10b7a0270803fd5f36ab93173e6d5b41b895 Mon Sep 17 00:00:00 2001
From: Eryu Guan
Date: Fri, 27 May 2016 14:27:18 -0700
Subject: [PATCH 08/19] direct-io: fix direct write stale data exposure from concurrent buffered read

Currently direct writes inside i_size on a DIO_SKIP_HOLES filesystem are
not allowed to allocate blocks (get_more_blocks() sets 'create' to 0
before calling the get_block() callback); if it's a sparse file, direct
writes fall back to buffered writes to avoid stale data exposure from a
concurrent buffered read. But there are two cases that can result in
stale data exposure which are not correctly detected.

1. The detection for "writing inside i_size" is not sufficient; writes
   can wrongly be treated as "extending writes". For example, direct
   write 1FSB (file system block) to a 1FSB sparse file on ext2/3/4,
   starting from offset 0. In this case it's writing inside i_size, but
   'create' is non-zero, because 'block_in_file' and
   '(i_size_read(inode) >> blkbits)' are both zero.

2. Direct writes starting from or beyond i_size (not inside i_size)
   could also trigger block allocation and expose stale data. For
   example, consider a sparse file with an i_size of 2k, and a write to
   offset 2k or 3k into the file, with a filesystem block size of 4k.
   (Thanks to Jeff Moyer for pointing this case out in his review.)

The first problem can be demonstrated by running the ltp-aiodio test
ADSP045 many times. When testing on extN filesystems, I see test
failures occasionally; a buffered read could read non-zero (stale) data.

ADSP045: dio_sparse -a 4k -w 4k -s 2k -n 1

dio_sparse    0  TINFO  :  Dirtying free blocks
dio_sparse    0  TINFO  :  Starting I/O tests
non zero buffer at buf[0] => 0xffffffaa,ffffffaa,ffffffaa,ffffffaa
non-zero read at offset 0
dio_sparse    0  TINFO  :  Killing childrens(s)
dio_sparse    1  TFAIL  :  dio_sparse.c:191: 1 children(s) exited abnormally

The second problem can also be reproduced easily by a hacked dio_sparse
program, which accepts an option to specify the write offset.

What we should really do is to disable block allocation for writes that
could result in filling holes inside i_size.
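To make the arithmetic concrete, take case 2 above (i_size = 2k, a write
at offset 2k, 4k blocks, so blkbits = i_blkbits = 12):

	old check: block_in_file < (i_size_read(inode) >> blkbits)
	           0             < (2048 >> 12) == 0   -> false, create stays 1
	                                                  (stale data exposed)
	new check: fs_startblk <= ((i_size_read(inode) - 1) >> i_blkbits)
	           0           <= (2047 >> 12) == 0    -> true,  create = 0
	                                                  (fall back to buffered)

In other words, the new predicate asks "does the first block of this
write overlap a block containing bytes inside i_size?", which is exactly
the condition under which filling a hole could expose stale data; a write
starting past the last in-size block is still allowed to allocate.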
Link: http://lkml.kernel.org/r/1463156728-13357-1-git-send-email-guaneryu@gmail.com
Reviewed-by: Jan Kara
Signed-off-by: Eryu Guan
Cc: Al Viro
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/direct-io.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 3bf3f20f8ecc..f3b4408be590 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -628,11 +628,11 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 		map_bh->b_size = fs_count << i_blkbits;
 
 		/*
-		 * For writes inside i_size on a DIO_SKIP_HOLES filesystem we
-		 * forbid block creations: only overwrites are permitted.
-		 * We will return early to the caller once we see an
-		 * unmapped buffer head returned, and the caller will fall
-		 * back to buffered I/O.
+		 * For writes that could fill holes inside i_size on a
+		 * DIO_SKIP_HOLES filesystem we forbid block creations: only
+		 * overwrites are permitted. We will return early to the caller
+		 * once we see an unmapped buffer head returned, and the caller
+		 * will fall back to buffered I/O.
 		 *
 		 * Otherwise the decision is left to the get_blocks method,
 		 * which may decide to handle it or also return an unmapped
@@ -640,8 +640,8 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 		 */
 		create = dio->rw & WRITE;
 		if (dio->flags & DIO_SKIP_HOLES) {
-			if (sdio->block_in_file < (i_size_read(dio->inode) >>
-							sdio->blkbits))
+			if (fs_startblk <= ((i_size_read(dio->inode) - 1) >>
+							i_blkbits))
 				create = 0;
 		}

From edd9f7230f591b7988533b1cafb07f3c03555f19 Mon Sep 17 00:00:00 2001
From: Vladimir Davydov
Date: Fri, 27 May 2016 14:27:21 -0700
Subject: [PATCH 09/19] mm: oom: do not reap task if there are live threads in threadgroup

If the current process is exiting, we don't invoke the oom killer;
instead we give it access to memory reserves and try to reap its mm in
case nobody is going to use it. There's a mistake in the code performing
this check: we just ignore any process of the same thread group,
regardless of whether it is exiting - see try_oom_reaper(). Fix it.

Link: http://lkml.kernel.org/r/1464087628-7318-1-git-send-email-vdavydov@virtuozzo.com
Fixes: 3ef22dfff239 ("oom, oom_reaper: try to reap tasks which skip regular OOM killer path")
Signed-off-by: Vladimir Davydov
Acked-by: Michal Hocko
Cc: Tetsuo Handa
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/oom_kill.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 5bb2f7698ad7..326dd14938f0 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -611,8 +611,6 @@ void try_oom_reaper(struct task_struct *tsk)
 			if (!process_shares_mm(p, mm))
 				continue;
-			if (same_thread_group(p, tsk))
-				continue;
 			if (fatal_signal_pending(p))
 				continue;

From 10540a6998de5d283e3e471f0d0f4e55e2a6872c Mon Sep 17 00:00:00 2001
From: Minfei Huang
Date: Fri, 27 May 2016 14:27:24 -0700
Subject: [PATCH 10/19] MAINTAINERS: add kexec_core.c and kexec_file.c

In the commits below, kexec.c was split into kexec.c, kexec_file.c and
kexec_core.c.

commit a43cac0d9dc2 ("kexec: split kexec_file syscall code to kexec_file.c")
commit 2965faa5e03d ("kexec: split kexec_load syscall from kexec core code")

Both kexec_file.c and kexec_core.c still belong to the kexec component.
In order to get the correct mailing lists from the script
get_maintainer.pl, add these files to MAINTAINERS.
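As a usage example, after this change a query against one of the split
files resolves to the kexec entry again (invocation real, output abridged
and illustrative):

	$ ./scripts/get_maintainer.pl -f kernel/kexec_core.c
	kexec@lists.infradead.org (open list:KEXEC)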
Link: http://lkml.kernel.org/r/1464189735-59113-1-git-send-email-mnghuan@gmail.com
Signed-off-by: Minfei Huang
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 MAINTAINERS | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 81e9c984d2f3..e8c17cc8fb01 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6559,7 +6559,7 @@ L:	kexec@lists.infradead.org
 S:	Maintained
 F:	include/linux/kexec.h
 F:	include/uapi/linux/kexec.h
-F:	kernel/kexec.c
+F:	kernel/kexec*
 
 KEYS/KEYRINGS:
 M:	David Howells

From f871f191357ebadb6ad7e8ecf6e8f40b694eea00 Mon Sep 17 00:00:00 2001
From: Vivek Goyal
Date: Fri, 27 May 2016 14:27:27 -0700
Subject: [PATCH 11/19] MAINTAINERS: Kdump maintainers update

I am proposing the following updates to kdump maintainership. I have
gotten busy with other things and am not getting time to spend on kdump.

Remove Haren Myneni, as he has not participated in kdump development for
a long time now.

Add the names of Dave and Baoquan as kdump maintainers, as they have been
contributing to kdump for a long time now and are in a much better
position to spend time on this than me.

Mark myself as a reviewer.

Link: http://lkml.kernel.org/r/20160525131616.GB27291@redhat.com
Signed-off-by: Vivek Goyal
Acked-by: Simon Horman
Cc: Haren Myneni
Cc: Dave Young
Cc: Baoquan He
Cc: "Eric W. Biederman"
Cc: Joe Perches
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 MAINTAINERS | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index e8c17cc8fb01..ce93aa9a1413 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6413,8 +6413,9 @@ F:	Documentation/kbuild/kconfig-language.txt
 F:	scripts/kconfig/
 
 KDUMP
-M:	Vivek Goyal
-M:	Haren Myneni
+M:	Dave Young
+M:	Baoquan He
+R:	Vivek Goyal
 L:	kexec@lists.infradead.org
 W:	http://lse.sourceforge.net/kdump/
 S:	Maintained

From fe53ca54270a757f0a28ee6bf3a54d952b550ed0 Mon Sep 17 00:00:00 2001
From: Yang Shi
Date: Fri, 27 May 2016 14:27:30 -0700
Subject: [PATCH 12/19] mm: use early_pfn_to_nid in page_ext_init

page_ext_init() checks suitable pages with pfn_to_nid(), but pfn_to_nid()
depends on the memmap, which will not be fully set up until
page_alloc_init_late() is done. Use early_pfn_to_nid() instead of
pfn_to_nid() so that the page extension can still be used early even when
CONFIG_DEFERRED_STRUCT_PAGE_INIT is enabled, and catch early page
allocation call sites.

Suggested by Joonsoo Kim [1], this fix basically undoes the change
introduced by commit b8f1a75d61d840 ("mm: call page_ext_init() after all
struct pages are initialized") and fixes the same problem with a better
approach.
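The distinction matters because, with SPARSEMEM, pfn_to_nid() reads the
node id out of the (possibly still uninitialised) memmap, while
early_pfn_to_nid() consults the memblock-derived node ranges instead.
Roughly (simplified from the generic definitions; the exact forms vary by
config):

	/* needs struct page for pfn to be initialised already: */
	#define pfn_to_nid(pfn)		page_to_nid(pfn_to_page(pfn))

	/* safe before page_alloc_init_late(): walks early node ranges */
	extern int early_pfn_to_nid(unsigned long pfn);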
[1] http://lkml.kernel.org/r/CAAmzW4OUmyPwQjvd7QUfc6W1Aic__TyAuH80MLRZNMxKy0-wPQ@mail.gmail.com

Link: http://lkml.kernel.org/r/1464198689-23458-1-git-send-email-yang.shi@linaro.org
Signed-off-by: Yang Shi
Cc: Joonsoo Kim
Cc: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 init/main.c   | 3 +--
 mm/page_ext.c | 4 +++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/init/main.c b/init/main.c
index bc0f9e0bcf22..4c17fda5c2ff 100644
--- a/init/main.c
+++ b/init/main.c
@@ -607,6 +607,7 @@ asmlinkage __visible void __init start_kernel(void)
 		initrd_start = 0;
 	}
 #endif
+	page_ext_init();
 	debug_objects_mem_init();
 	kmemleak_init();
 	setup_per_cpu_pageset();
@@ -1003,8 +1004,6 @@ static noinline void __init kernel_init_freeable(void)
 	sched_init_smp();
 
 	page_alloc_init_late();
-	/* Initialize page ext after all struct pages are initializaed */
-	page_ext_init();
 
 	do_basic_setup();

diff --git a/mm/page_ext.c b/mm/page_ext.c
index 2d864e64f7fe..44a4c029c8e7 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -390,8 +390,10 @@ void __init page_ext_init(void)
 			 * We know some arch can have a nodes layout such as
 			 * -------------pfn-------------->
 			 * N0 | N1 | N2 | N0 | N1 | N2|....
+			 *
+			 * Take into account DEFERRED_STRUCT_PAGE_INIT.
 			 */
-			if (pfn_to_nid(pfn) != nid)
+			if (early_pfn_to_nid(pfn) != nid)
 				continue;
 			if (init_section_page_ext(pfn, nid))
 				goto oom;

From f65e91df25aa426289cbcb580ca3183e24979fb1 Mon Sep 17 00:00:00 2001
From: Yang Shi
Date: Fri, 27 May 2016 14:27:32 -0700
Subject: [PATCH 13/19] mm: use early_pfn_to_nid in register_page_bootmem_info_node

register_page_bootmem_info_node() is invoked in mem_init(), so it will be
called before page_alloc_init_late() if DEFERRED_STRUCT_PAGE_INIT is
enabled. But pfn_to_nid() depends on the memmap, which won't be fully set
up until page_alloc_init_late() is done, so replace pfn_to_nid() with
early_pfn_to_nid().

Link: http://lkml.kernel.org/r/1464210007-30930-1-git-send-email-yang.shi@linaro.org
Signed-off-by: Yang Shi
Cc: Mel Gorman
Cc: Joonsoo Kim
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory_hotplug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index caf2a14c37ad..b8ee0806415f 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -300,7 +300,7 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
 		 * multiple nodes we check that this pfn does not already
 		 * reside in some other nodes.
 		 */
-		if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
+		if (pfn_valid(pfn) && (early_pfn_to_nid(pfn) == node))
 			register_page_bootmem_info_section(pfn);
 	}
 }

From e2fe14564d3316d1625ed20bf1083995f4960893 Mon Sep 17 00:00:00 2001
From: Michal Hocko
Date: Fri, 27 May 2016 14:27:35 -0700
Subject: [PATCH 14/19] oom_reaper: close race with exiting task

Tetsuo has reported:

  Out of memory: Kill process 443 (oleg's-test) score 855 or sacrifice child
  Killed process 443 (oleg's-test) total-vm:493248kB, anon-rss:423880kB, file-rss:4kB, shmem-rss:0kB
  sh invoked oom-killer: gfp_mask=0x24201ca(GFP_HIGHUSER_MOVABLE|__GFP_COLD), order=0, oom_score_adj=0
  sh cpuset=/ mems_allowed=0
  CPU: 2 PID: 1 Comm: sh Not tainted 4.6.0-rc7+ #51
  Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/31/2013
  Call Trace:
    dump_stack+0x85/0xc8
    dump_header+0x5b/0x394
  oom_reaper: reaped process 443 (oleg's-test), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB

In other words:

  __oom_reap_task			exit_mm
    atomic_inc_not_zero
					tsk->mm = NULL
					mmput
					  atomic_dec_and_test # > 0
					exit_oom_victim # New victim will be
							# selected
					# no TIF_MEMDIE task so we can select a new one
    unmap_page_range # to release the memory

The race exists even without the oom_reaper, because anybody who pins the
address space and gets preempted might race with exit_mm, but the
oom_reaper made this race more probable.

We can address the oom_reaper part by using oom_lock for __oom_reap_task,
because this would guarantee that a new oom victim will not be selected
if the oom reaper might race with the exit path.

This doesn't solve the original issue, though, because somebody else
still might be pinning mm_users, and so __mmput won't be called to
release the memory. But that is not really reliably solvable, because the
task will get away from the OOM killer's sight as soon as it is unhashed
from the task_list, and so we cannot guarantee a new victim won't be
selected.

[akpm@linux-foundation.org: fix use of unused `mm', Per Stephen]
[akpm@linux-foundation.org: coding-style fixes]
Fixes: aac453635549 ("mm, oom: introduce oom reaper")
Link: http://lkml.kernel.org/r/1464271493-20008-1-git-send-email-mhocko@kernel.org
Signed-off-by: Michal Hocko
Reported-by: Tetsuo Handa
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/oom_kill.c | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 326dd14938f0..dfb1ab61fb23 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -443,12 +443,28 @@ static bool __oom_reap_task(struct task_struct *tsk)
 {
 	struct mmu_gather tlb;
 	struct vm_area_struct *vma;
-	struct mm_struct *mm;
+	struct mm_struct *mm = NULL;
 	struct task_struct *p;
 	struct zap_details details = {.check_swap_entries = true,
 				      .ignore_dirty = true};
 	bool ret = true;
 
+	/*
+	 * We have to make sure to not race with the victim exit path
+	 * and cause premature new oom victim selection:
+	 * __oom_reap_task		exit_mm
+	 *   atomic_inc_not_zero
+	 *				  mmput
+	 *				    atomic_dec_and_test
+	 *				  exit_oom_victim
+	 *				[...]
+	 *				out_of_memory
+	 *				  select_bad_process
+	 *				    # no TIF_MEMDIE task selects new victim
+	 *  unmap_page_range # frees some memory
+	 */
+	mutex_lock(&oom_lock);
+
 	/*
 	 * Make sure we find the associated mm_struct even when the particular
 	 * thread has already terminated and cleared its mm.
@@ -457,19 +473,19 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	 */
 	p = find_lock_task_mm(tsk);
 	if (!p)
-		return true;
+		goto unlock_oom;
 
 	mm = p->mm;
 	if (!atomic_inc_not_zero(&mm->mm_users)) {
 		task_unlock(p);
-		return true;
+		goto unlock_oom;
 	}
 
 	task_unlock(p);
 
 	if (!down_read_trylock(&mm->mmap_sem)) {
 		ret = false;
-		goto out;
+		goto unlock_oom;
 	}
 
 	tlb_gather_mmu(&tlb, mm, 0, -1);
@@ -511,13 +527,15 @@ static bool __oom_reap_task(struct task_struct *tsk)
 	 * to release its memory.
 	 */
 	set_bit(MMF_OOM_REAPED, &mm->flags);
-out:
+unlock_oom:
+	mutex_unlock(&oom_lock);
 	/*
 	 * Drop our reference but make sure the mmput slow path is called from a
 	 * different context because we shouldn't risk we get stuck there and
 	 * put the oom_reaper out of the way.
 	 */
-	mmput_async(mm);
+	if (mm)
+		mmput_async(mm);
 	return ret;
 }

From 0798d3c022dc63eb0ec02b511e1f76ca8411ef8e Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Fri, 27 May 2016 14:27:38 -0700
Subject: [PATCH 15/19] mm: thp: avoid false positive VM_BUG_ON_PAGE in page_move_anon_rmap()

If page_move_anon_rmap() is refiling a pmd-splitted THP mapped in a tail
page from a pte, the "address" must be THP aligned in order for the
page->index bugcheck to pass in the CONFIG_DEBUG_VM=y builds.

Link: http://lkml.kernel.org/r/1464253620-106404-1-git-send-email-kirill.shutemov@linux.intel.com
Fixes: 6d0a07edd17c ("mm: thp: calculate the mapcount correctly for THP pages during WP faults")
Signed-off-by: Kirill A. Shutemov
Reported-by: Mika Westerberg
Tested-by: Mika Westerberg
Reviewed-by: Andrea Arcangeli
Cc: [4.5]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/rmap.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/rmap.c b/mm/rmap.c
index 8a839935b18c..0ea5d9071b32 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1098,6 +1098,8 @@ void page_move_anon_rmap(struct page *page,
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_VMA(!anon_vma, vma);
+	if (IS_ENABLED(CONFIG_DEBUG_VM) && PageTransHuge(page))
+		address &= HPAGE_PMD_MASK;
 	VM_BUG_ON_PAGE(page->index != linear_page_index(vma, address), page);
 
 	anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;

From badbda53e505089062e194c614e6f23450bc98b2 Mon Sep 17 00:00:00 2001
From: Stephen Rothwell
Date: Fri, 27 May 2016 14:27:41 -0700
Subject: [PATCH 16/19] mm/cma: silence warnings due to max() usage

pageblock_order can be (at least) an unsigned int or an unsigned long,
depending on the kernel config and architecture, so use
max_t(unsigned long, ...) when comparing it.

fixes these warnings:

In file included from include/asm-generic/bug.h:13:0,
                 from arch/powerpc/include/asm/bug.h:127,
                 from include/linux/bug.h:4,
                 from include/linux/mmdebug.h:4,
                 from include/linux/mm.h:8,
                 from include/linux/memblock.h:18,
                 from mm/cma.c:28:
mm/cma.c: In function 'cma_init_reserved_mem':
include/linux/kernel.h:748:17: warning: comparison of distinct pointer types lacks a cast
  (void) (&_max1 == &_max2);
                 ^
mm/cma.c:186:27: note: in expansion of macro 'max'
  alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
                           ^
mm/cma.c: In function 'cma_declare_contiguous':
include/linux/kernel.h:748:17: warning: comparison of distinct pointer types lacks a cast
  (void) (&_max1 == &_max2);
                 ^
include/linux/kernel.h:747:9: note: in definition of macro 'max'
  typeof(y) _max2 = (y);
         ^
mm/cma.c:270:29: note: in expansion of macro 'max'
   (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order));
                             ^
include/linux/kernel.h:748:17: warning: comparison of distinct pointer types lacks a cast
  (void) (&_max1 == &_max2);
                 ^
include/linux/kernel.h:747:21: note: in definition of macro 'max'
  typeof(y) _max2 = (y);
                     ^
mm/cma.c:270:29: note: in expansion of macro 'max'
   (phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order));
                             ^

[akpm@linux-foundation.org: coding-style fixes]
Link: http://lkml.kernel.org/r/20160526150748.5be38a4f@canb.auug.org.au
Signed-off-by: Stephen Rothwell
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/cma.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/cma.c b/mm/cma.c
index ea506eb18cd6..bd0e1412475e 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -183,7 +183,8 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
 		return -EINVAL;
 
 	/* ensure minimal alignment required by mm core */
-	alignment = PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order);
+	alignment = PAGE_SIZE <<
+			max_t(unsigned long, MAX_ORDER - 1, pageblock_order);
 
 	/* alignment should be aligned with order_per_bit */
 	if (!IS_ALIGNED(alignment >> PAGE_SHIFT, 1 << order_per_bit))
@@ -266,8 +267,8 @@ int __init cma_declare_contiguous(phys_addr_t base,
 	 * migratetype page by page allocator's buddy algorithm. In the case,
 	 * you couldn't get a contiguous memory, which is not what we want.
 	 */
-	alignment = max(alignment,
-		(phys_addr_t)PAGE_SIZE << max(MAX_ORDER - 1, pageblock_order));
+	alignment = max(alignment, (phys_addr_t)PAGE_SIZE <<
+			  max_t(unsigned long, MAX_ORDER - 1, pageblock_order));
 	base = ALIGN(base, alignment);
 	size = ALIGN(size, alignment);
 	limit &= ~(alignment - 1);

From cbedbac3e66121ddbac363776c23119f8eaeefda Mon Sep 17 00:00:00 2001
From: Li RongQing
Date: Fri, 27 May 2016 14:27:43 -0700
Subject: [PATCH 17/19] mm/memcontrol.c: fix the margin computation in mem_cgroup_margin()

mem_cgroup_margin() might return (memory.limit - memory_count) when the
memsw.limit is in excess. This doesn't happen usually because we do not
allow excess on hard limits and (memory.limit <= memsw.limit), but
__GFP_NOFAIL charges can force the charge and cause the excess when no
memory is really swappable (swap is full or no anonymous memory is left).

[mhocko@suse.com: rewrote changelog]
Link: http://lkml.kernel.org/r/20160525155122.GK20132@dhcp22.suse.cz
Link: http://lkml.kernel.org/r/1464068266-27736-1-git-send-email-roy.qing.li@gmail.com
Signed-off-by: Li RongQing
Acked-by: Vladimir Davydov
Acked-by: Michal Hocko
Cc: Johannes Weiner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f6477a9dbe7a..485c688a7fa8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1108,6 +1108,8 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
 		limit = READ_ONCE(memcg->memsw.limit);
 		if (count <= limit)
 			margin = min(margin, limit - count);
+		else
+			margin = 0;
 	}
 
 	return margin;

From 7cf7806ce1e30f1691cf340f70b807acbdf419ef Mon Sep 17 00:00:00 2001
From: Li RongQing
Date: Fri, 27 May 2016 14:27:46 -0700
Subject: [PATCH 18/19] mm/memcontrol.c: move comments for get_mctgt_type() to proper position

Move the comments for get_mctgt_type() to be before the get_mctgt_type()
implementation.

Link: http://lkml.kernel.org/r/1463644638-7446-1-git-send-email-roy.qing.li@gmail.com
Signed-off-by: Li RongQing
Acked-by: Michal Hocko
Cc: Johannes Weiner
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memcontrol.c | 37 +++++++++++++++++++------------------
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 485c688a7fa8..925b431f3f03 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -4309,24 +4309,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
 	return 0;
 }
 
-/**
- * get_mctgt_type - get target type of moving charge
- * @vma: the vma the pte to be checked belongs
- * @addr: the address corresponding to the pte to be checked
- * @ptent: the pte to be checked
- * @target: the pointer the target page or swap ent will be stored(can be NULL)
- *
- * Returns
- *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
- *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
- *     move charge. if @target is not NULL, the page is stored in target->page
- *     with extra refcnt got(Callers should handle it).
- *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
- *     target for charge migration. if @target is not NULL, the entry is stored
- *     in target->ent.
- *
- * Called with pte lock held.
- */
 union mc_target {
 	struct page *page;
 	swp_entry_t ent;
@@ -4515,6 +4497,25 @@ out:
 	return ret;
 }
 
+/**
+ * get_mctgt_type - get target type of moving charge
+ * @vma: the vma the pte to be checked belongs
+ * @addr: the address corresponding to the pte to be checked
+ * @ptent: the pte to be checked
+ * @target: the pointer the target page or swap ent will be stored(can be NULL)
+ *
+ * Returns
+ *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
+ *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
+ *     move charge. if @target is not NULL, the page is stored in target->page
+ *     with extra refcnt got(Callers should handle it).
+ *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
+ *     target for charge migration. if @target is not NULL, the entry is stored
+ *     in target->ent.
+ *
+ * Called with pte lock held.
+ */
+
 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		unsigned long addr, pte_t ptent, union mc_target *target)
 {

From 11e685672a0861ce136cc4e7f6fdd11e5390b1fa Mon Sep 17 00:00:00 2001
From: Gavin Shan
Date: Fri, 27 May 2016 14:27:49 -0700
Subject: [PATCH 19/19] mm: disable DEFERRED_STRUCT_PAGE_INIT on !NO_BOOTMEM

When we have !NO_BOOTMEM, the deferred page struct initialization doesn't
work well, because the pages reserved in bootmem are released to the page
allocator unconditionally. It causes memory corruption and, eventually, a
system crash.

As Mel suggested, bootmem is being retired slowly. We fix the issue by
simply hiding DEFERRED_STRUCT_PAGE_INIT when bootmem is enabled.

Link: http://lkml.kernel.org/r/1460602170-5821-1-git-send-email-gwshan@linux.vnet.ibm.com
Signed-off-by: Gavin Shan
Acked-by: Mel Gorman
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mm/Kconfig b/mm/Kconfig
index 22fa8189e4fc..3e2daef3c946 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -648,7 +648,7 @@ config DEFERRED_STRUCT_PAGE_INIT
 	bool "Defer initialisation of struct pages to kthreads"
 	default n
 	depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
-	depends on MEMORY_HOTPLUG
+	depends on NO_BOOTMEM && MEMORY_HOTPLUG
 	depends on !FLATMEM
 	help
 	  Ordinarily all struct pages are initialised during early boot in a