From 192efb7a1f9b69ce2ec1212ee8c24fb9b4a80a35 Mon Sep 17 00:00:00 2001 From: Maninder Singh Date: Wed, 15 Nov 2017 17:31:14 -0800 Subject: [PATCH 001/131] bloat-o-meter: provide 3 different arguments for data, function and All This patch provides 3 new arguments for bloat-o-meter 1) -c -> for all (showing function and data differently) 2) -d -> data 3) -t -> function output: ./scripts/bloat-o-meter -c "file1" "file2" add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-152 (-152) Function old new delta main 412 260 -152 Total: Before=548, After=396, chg -27.74% ########################################################## add/remove: 1/0 grow/shrink: 1/0 up/down: 84/0 (84) Data old new delta arr - 64 +64 backtrace 60 80 +20 Total: Before=109, After=193, chg +77.06% ########################################################## add/remove: 0/1 grow/shrink: 0/0 up/down: 0/-64 (-64) RO Data old new delta arr 64 - -64 Total: Before=68, After=4, chg -94.12% [maninder1.s@samsung.com: v1 -> v2] Link: http://lkml.kernel.org/r/1506569402-24787-1-git-send-email-maninder1.s@samsung.com Link: http://lkml.kernel.org/r/1506336313-27187-1-git-send-email-maninder1.s@samsung.com Signed-off-by: Vaneet Narang Signed-off-by: Maninder Singh Cc: Amit Sahrawat Cc: Andi Kleen Cc: Michal Marek Cc: Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/bloat-o-meter | 89 ++++++++++++++++++++++++++----------------- 1 file changed, 55 insertions(+), 34 deletions(-) diff --git a/scripts/bloat-o-meter b/scripts/bloat-o-meter index a27677146410..6f099f915dcf 100755 --- a/scripts/bloat-o-meter +++ b/scripts/bloat-o-meter @@ -12,18 +12,22 @@ from signal import signal, SIGPIPE, SIG_DFL signal(SIGPIPE, SIG_DFL) -if len(sys.argv) != 3: - sys.stderr.write("usage: %s file1 file2\n" % sys.argv[0]) +if len(sys.argv) < 3: + sys.stderr.write("usage: %s [option] file1 file2\n" % sys.argv[0]) + sys.stderr.write("The options are:\n") + sys.stderr.write("-c cateogrize output based on symbole type\n") + sys.stderr.write("-d Show delta of Data Section\n") + sys.stderr.write("-t Show delta of text Section\n") sys.exit(-1) re_NUMBER = re.compile(r'\.[0-9]+') -def getsizes(file): +def getsizes(file, format): sym = {} with os.popen("nm --size-sort " + file) as f: for line in f: size, type, name = line.split() - if type in "tTdDbBrR": + if type in format: # strip generated symbols if name.startswith("__mod_"): continue if name.startswith("SyS_"): continue @@ -34,44 +38,61 @@ def getsizes(file): sym[name] = sym.get(name, 0) + int(size, 16) return sym -old = getsizes(sys.argv[1]) -new = getsizes(sys.argv[2]) -grow, shrink, add, remove, up, down = 0, 0, 0, 0, 0, 0 -delta, common = [], {} -otot, ntot = 0, 0 +def calc(oldfile, newfile, format): + old = getsizes(oldfile, format) + new = getsizes(newfile, format) + grow, shrink, add, remove, up, down = 0, 0, 0, 0, 0, 0 + delta, common = [], {} + otot, ntot = 0, 0 -for a in old: - if a in new: - common[a] = 1 + for a in old: + if a in new: + common[a] = 1 -for name in old: - otot += old[name] - if name not in common: - remove += 1 - down += old[name] - delta.append((-old[name], name)) + for name in old: + otot += old[name] + if name not in common: + remove += 1 + down += old[name] + delta.append((-old[name], name)) -for name in new: - ntot += new[name] - if name not in common: - add += 1 - up += new[name] - delta.append((new[name], name)) + for name in new: + ntot += new[name] + if name not in common: + add += 1 + up += new[name] + delta.append((new[name], name)) -for name in common: + for name in common: d = new.get(name, 0) - old.get(name, 0) if d>0: grow, up = grow+1, up+d if d<0: shrink, down = shrink+1, down-d delta.append((d, name)) -delta.sort() -delta.reverse() + delta.sort() + delta.reverse() + return grow, shrink, add, remove, up, down, delta, old, new, otot, ntot -print("add/remove: %s/%s grow/shrink: %s/%s up/down: %s/%s (%s)" % \ - (add, remove, grow, shrink, up, -down, up-down)) -print("%-40s %7s %7s %+7s" % ("function", "old", "new", "delta")) -for d, n in delta: - if d: print("%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d)) +def print_result(symboltype, symbolformat, argc): + grow, shrink, add, remove, up, down, delta, old, new, otot, ntot = \ + calc(sys.argv[argc - 1], sys.argv[argc], symbolformat) -print("Total: Before=%d, After=%d, chg %+.2f%%" % \ - (otot, ntot, (ntot - otot)*100.0/otot)) + print("add/remove: %s/%s grow/shrink: %s/%s up/down: %s/%s (%s)" % \ + (add, remove, grow, shrink, up, -down, up-down)) + print("%-40s %7s %7s %+7s" % (symboltype, "old", "new", "delta")) + for d, n in delta: + if d: print("%-40s %7s %7s %+7d" % (n, old.get(n,"-"), new.get(n,"-"), d)) + + print("Total: Before=%d, After=%d, chg %+.2f%%" % \ + (otot, ntot, (ntot - otot)*100.0/otot)) + +if sys.argv[1] == "-c": + print_result("Function", "tT", 3) + print_result("Data", "dDbB", 3) + print_result("RO Data", "rR", 3) +elif sys.argv[1] == "-d": + print_result("Data", "dDbBrR", 3) +elif sys.argv[1] == "-t": + print_result("Function", "tT", 3) +else: + print_result("Function", "tTdDbBrR", 2) From c95f121142a41c951fd62683a5574f2a7b37c573 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 15 Nov 2017 17:31:18 -0800 Subject: [PATCH 002/131] m32r: fix endianness constraints The m32r Kconfig provides both CPU_BIG_ENDIAN and CPU_LITTLE_ENDIAN configuration options. As they are user-selectable and independent, this allows invalid configurations: - All m32r defconfigs build a big endian kernel, but CPU_BIG_ENDIAN is not set, causing compiler warnings like: include/linux/byteorder/big_endian.h:7:2: warning: #warning inconsistent configuration, needs CONFIG_CPU_BIG_ENDIAN [-Wcpp] #warning inconsistent configuration, needs CONFIG_CPU_BIG_ENDIAN ^ - Since commit 5bdfca6435b82944 ("m32r: define CPU_BIG_ENDIAN"), building an allmodconfig or allyesconfig enables both CONFIG_CPU_BIG_ENDIAN and CONFIG_CPU_LITTLE_ENDIAN. While this did get rid of the warning above, both options are obviously mutually exclusive. Fix this by making only CPU_LITTLE_ENDIAN configurable by the user, as before, and by making sure exactly one of CPU_BIG_ENDIAN and CPU_LITTLE_ENDIAN is always enabled. Link: http://lkml.kernel.org/r/1509361505-18150-1-git-send-email-geert@linux-m68k.org Fixes: 5bdfca6435b82944 ("m32r: define CPU_BIG_ENDIAN") Signed-off-by: Geert Uytterhoeven Cc: Sudip Mukherjee Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m32r/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 0d9446c37ae8..498398d915c1 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -196,8 +196,8 @@ config TIMER_DIVIDE default "128" config CPU_BIG_ENDIAN - bool "Generate big endian code" - default n + bool + default !CPU_LITTLE_ENDIAN config CPU_LITTLE_ENDIAN bool "Generate little endian code" From a60874f85818aa2079fb5aefb4b32818ed2aba7d Mon Sep 17 00:00:00 2001 From: Guozhonghua Date: Wed, 15 Nov 2017 17:31:21 -0800 Subject: [PATCH 003/131] ocfs2: remove unused declaration ocfs2_publish_get_mount_state() Link: http://lkml.kernel.org/r/71604351584F6A4EBAE558C676F37CA4D0743232@H3CMLB12-EX.srv.huawei-3com.com Signed-off-by: guozhonghua Acked-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/super.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ocfs2/super.h b/fs/ocfs2/super.h index b023e4f3d740..d4550c8bbc41 100644 --- a/fs/ocfs2/super.h +++ b/fs/ocfs2/super.h @@ -26,9 +26,6 @@ #ifndef OCFS2_SUPER_H #define OCFS2_SUPER_H -int ocfs2_publish_get_mount_state(struct ocfs2_super *osb, - int node_num); - __printf(3, 4) int __ocfs2_error(struct super_block *sb, const char *function, const char *fmt, ...); From 23e0813a0807754b5be18688b7564c58e0eb2f2a Mon Sep 17 00:00:00 2001 From: piaojun Date: Wed, 15 Nov 2017 17:31:25 -0800 Subject: [PATCH 004/131] ocfs2: no need flush workqueue before destroying it destroy_workqueue() will do flushing work for us. Link: http://lkml.kernel.org/r/59E06476.3090502@huawei.com Signed-off-by: Jun Piao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmdomain.c | 1 - fs/ocfs2/dlmfs/dlmfs.c | 1 - fs/ocfs2/super.c | 4 +--- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index a2b19fbdcf46..e1fea149f50b 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -394,7 +394,6 @@ int dlm_domain_fully_joined(struct dlm_ctxt *dlm) static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) { if (dlm->dlm_worker) { - flush_workqueue(dlm->dlm_worker); destroy_workqueue(dlm->dlm_worker); dlm->dlm_worker = NULL; } diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 9ab9e1892b5f..edce7b5f1604 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -670,7 +670,6 @@ static void __exit exit_dlmfs_fs(void) { unregister_filesystem(&dlmfs_fs_type); - flush_workqueue(user_dlm_worker); destroy_workqueue(user_dlm_worker); /* diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 80733496b22a..040bbb6a6e4b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2521,10 +2521,8 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) /* This function assumes that the caller has the main osb resource */ /* ocfs2_initializer_super have already created this workqueue */ - if (osb->ocfs2_wq) { - flush_workqueue(osb->ocfs2_wq); + if (osb->ocfs2_wq) destroy_workqueue(osb->ocfs2_wq); - } ocfs2_free_slot_info(osb); From 98d6c09ec2899a9a601b16ec7ae31d54e6b100b9 Mon Sep 17 00:00:00 2001 From: piaojun Date: Wed, 15 Nov 2017 17:31:29 -0800 Subject: [PATCH 005/131] ocfs2: cleanup unused func declaration and assignment Link: http://lkml.kernel.org/r/59E064BB.8000005@huawei.com Signed-off-by: Jun Piao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 2 -- fs/ocfs2/cluster/heartbeat.h | 2 -- 2 files changed, 4 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index addd7c5f2d3e..ab5105f9767e 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -3585,8 +3585,6 @@ static int ocfs2_merge_rec_left(struct ocfs2_path *right_path, * The easy case - we can just plop the record right in. */ *left_rec = *split_rec; - - has_empty_extent = 0; } else le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 3ef5137dc362..a9e67efc0004 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -79,10 +79,8 @@ void o2hb_fill_node_map(unsigned long *map, unsigned bytes); void o2hb_exit(void); int o2hb_init(void); -int o2hb_check_node_heartbeating(u8 node_num); int o2hb_check_node_heartbeating_no_sem(u8 node_num); int o2hb_check_node_heartbeating_from_callback(u8 node_num); -int o2hb_check_local_node_heartbeating(void); void o2hb_stop_all_regions(void); int o2hb_get_all_regions(char *region_uuids, u8 numregions); int o2hb_global_heartbeat_active(void); From 1c01967116a678fed8e2c68a6ab82abc8effeddc Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Wed, 15 Nov 2017 17:31:33 -0800 Subject: [PATCH 006/131] ocfs2: fix cluster hang after a node dies When a node dies, other live nodes have to choose a new master for an existed lock resource mastered by the dead node. As for ocfs2/dlm implementation, this is done by function - dlm_move_lockres_to_recovery_list which marks those lock rsources as DLM_LOCK_RES_RECOVERING and manages them via a list from which DLM changes lock resource's master later. So without invoking dlm_move_lockres_to_recovery_list, no master will be choosed after dlm recovery accomplishment since no lock resource can be found through ::resource list. What's worse is that if DLM_LOCK_RES_RECOVERING is not marked for lock resources mastered a dead node, it will break up synchronization among nodes. So invoke dlm_move_lockres_to_recovery_list again. Fixs: 'commit ee8f7fcbe638 ("ocfs2/dlm: continue to purge recovery lockres when recovery master goes down")' Link: http://lkml.kernel.org/r/63ADC13FD55D6546B7DECE290D39E373CED6E0F9@H3CMLB14-EX.srv.huawei-3com.com Signed-off-by: Changwei Ge Reported-by: Vitaly Mayatskih Tested-by: Vitaly Mayatskikh Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmrecovery.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 74407c6dd592..ec8f75813beb 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2419,6 +2419,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) dlm_lockres_put(res); continue; } + dlm_move_lockres_to_recovery_list(dlm, res); } else if (res->owner == dlm->node_num) { dlm_free_dead_locks(dlm, res, dead_node); __dlm_lockres_calc_usage(dlm, res); From 67b1b8d14aa08aa8df835377074ed629629bda50 Mon Sep 17 00:00:00 2001 From: piaojun Date: Wed, 15 Nov 2017 17:31:37 -0800 Subject: [PATCH 007/131] ocfs2: clean up some unused function declarations Link: http://lkml.kernel.org/r/59C5D7D6.9050106@huawei.com Signed-off-by: Jun Piao Reviewed-by: Alex Chen Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/buffer_head_io.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h index b97bcc6dde7c..b1bb70c8ca4d 100644 --- a/fs/ocfs2/buffer_head_io.h +++ b/fs/ocfs2/buffer_head_io.h @@ -28,9 +28,6 @@ #include -void ocfs2_end_buffer_io_sync(struct buffer_head *bh, - int uptodate); - int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, struct ocfs2_caching_info *ci); From 28f5a8a7c033cbf3e32277f4cc9c6afd74f05300 Mon Sep 17 00:00:00 2001 From: alex chen Date: Wed, 15 Nov 2017 17:31:40 -0800 Subject: [PATCH 008/131] ocfs2: should wait dio before inode lock in ocfs2_setattr() we should wait dio requests to finish before inode lock in ocfs2_setattr(), otherwise the following deadlock will happen: process 1 process 2 process 3 truncate file 'A' end_io of writing file 'A' receiving the bast messages ocfs2_setattr ocfs2_inode_lock_tracker ocfs2_inode_lock_full inode_dio_wait __inode_dio_wait -->waiting for all dio requests finish dlm_proxy_ast_handler dlm_do_local_bast ocfs2_blocking_ast ocfs2_generic_handle_bast set OCFS2_LOCK_BLOCKED flag dio_end_io dio_bio_end_aio dio_complete ocfs2_dio_end_io ocfs2_dio_end_io_write ocfs2_inode_lock __ocfs2_cluster_lock ocfs2_wait_for_mask -->waiting for OCFS2_LOCK_BLOCKED flag to be cleared, that is waiting for 'process 1' unlocking the inode lock inode_dio_end -->here dec the i_dio_count, but will never be called, so a deadlock happened. Link: http://lkml.kernel.org/r/59F81636.70508@huawei.com Signed-off-by: Alex Chen Reviewed-by: Jun Piao Reviewed-by: Joseph Qi Acked-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6e41fc8fabbe..dc455d45a66a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1161,6 +1161,13 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; if (size_change) { + /* + * Here we should wait dio to finish before inode lock + * to avoid a deadlock between ocfs2_setattr() and + * ocfs2_dio_end_io_write() + */ + inode_dio_wait(inode); + status = ocfs2_rw_lock(inode, 1); if (status < 0) { mlog_errno(status); @@ -1200,8 +1207,6 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) if (status) goto bail_unlock; - inode_dio_wait(inode); - if (i_size_read(inode) >= attr->ia_size) { if (ocfs2_should_order_data(inode)) { status = ocfs2_begin_ordered_truncate(inode, From 3e4c56d41eef5595035872a2ec5a483f42e8917f Mon Sep 17 00:00:00 2001 From: alex chen Date: Wed, 15 Nov 2017 17:31:44 -0800 Subject: [PATCH 009/131] ocfs2: ip_alloc_sem should be taken in ocfs2_get_block() ip_alloc_sem should be taken in ocfs2_get_block() when reading file in DIRECT mode to prevent concurrent access to extent tree with ocfs2_dio_end_io_write(), which may cause BUGON in the following situation: read file 'A' end_io of writing file 'A' vfs_read __vfs_read ocfs2_file_read_iter generic_file_read_iter ocfs2_direct_IO __blockdev_direct_IO do_blockdev_direct_IO do_direct_IO get_more_blocks ocfs2_get_block ocfs2_extent_map_get_blocks ocfs2_get_clusters ocfs2_get_clusters_nocache() ocfs2_search_extent_list return the index of record which contains the v_cluster, that is v_cluster > rec[i]->e_cpos. ocfs2_dio_end_io ocfs2_dio_end_io_write down_write(&oi->ip_alloc_sem); ocfs2_mark_extent_written ocfs2_change_extent_flag ocfs2_split_extent ... --> modify the rec[i]->e_cpos, resulting in v_cluster < rec[i]->e_cpos. BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)) [alex.chen@huawei.com: v3] Link: http://lkml.kernel.org/r/59EF3614.6050008@huawei.com Link: http://lkml.kernel.org/r/59EF3614.6050008@huawei.com Fixes: c15471f79506 ("ocfs2: fix sparse file & data ordering issue in direct io") Signed-off-by: Alex Chen Reviewed-by: Jun Piao Reviewed-by: Joseph Qi Reviewed-by: Gang He Acked-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 88a31e9340a0..d1516327b787 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -134,6 +134,19 @@ bail: return err; } +static int ocfs2_lock_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + int ret = 0; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + down_read(&oi->ip_alloc_sem); + ret = ocfs2_get_block(inode, iblock, bh_result, create); + up_read(&oi->ip_alloc_sem); + + return ret; +} + int ocfs2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -2128,7 +2141,7 @@ static void ocfs2_dio_free_write_ctx(struct inode *inode, * called like this: dio->get_blocks(dio->inode, fs_startblk, * fs_count, map_bh, dio->rw == WRITE); */ -static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, +static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -2154,12 +2167,9 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, * while file size will be changed. */ if (pos + total_len <= i_size_read(inode)) { - down_read(&oi->ip_alloc_sem); + /* This is the fast path for re-write. */ - ret = ocfs2_get_block(inode, iblock, bh_result, create); - - up_read(&oi->ip_alloc_sem); - + ret = ocfs2_lock_get_block(inode, iblock, bh_result, create); if (buffer_mapped(bh_result) && !buffer_new(bh_result) && ret == 0) @@ -2424,9 +2434,9 @@ static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) return 0; if (iov_iter_rw(iter) == READ) - get_block = ocfs2_get_block; + get_block = ocfs2_lock_get_block; else - get_block = ocfs2_dio_get_block; + get_block = ocfs2_dio_wr_get_block; return __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, get_block, From 853bc26a7ea39e354b9f8889ae7ad1492ffa28d2 Mon Sep 17 00:00:00 2001 From: alex chen Date: Wed, 15 Nov 2017 17:31:48 -0800 Subject: [PATCH 010/131] ocfs2: subsystem.su_mutex is required while accessing the item->ci_parent The subsystem.su_mutex is required while accessing the item->ci_parent, otherwise, NULL pointer dereference to the item->ci_parent will be triggered in the following situation: add node delete node sys_write vfs_write configfs_write_file o2nm_node_store o2nm_node_local_write do_rmdir vfs_rmdir configfs_rmdir mutex_lock(&subsys->su_mutex); unlink_obj item->ci_group = NULL; item->ci_parent = NULL; to_o2nm_cluster_from_node node->nd_item.ci_parent->ci_parent BUG since of NULL pointer dereference to nd_item.ci_parent Moreover, the o2nm_cluster also should be protected by the subsystem.su_mutex. [alex.chen@huawei.com: v2] Link: http://lkml.kernel.org/r/59EEAA69.9080703@huawei.com Link: http://lkml.kernel.org/r/59E9B36A.10700@huawei.com Signed-off-by: Alex Chen Reviewed-by: Jun Piao Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/nodemanager.c | 63 +++++++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 8 deletions(-) diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index a51200ece93d..da64c3a20eeb 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -40,6 +40,9 @@ char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { "panic", /* O2NM_FENCE_PANIC */ }; +static inline void o2nm_lock_subsystem(void); +static inline void o2nm_unlock_subsystem(void); + struct o2nm_node *o2nm_get_node_by_num(u8 node_num) { struct o2nm_node *node = NULL; @@ -181,7 +184,10 @@ static struct o2nm_cluster *to_o2nm_cluster_from_node(struct o2nm_node *node) { /* through the first node_set .parent * mycluster/nodes/mynode == o2nm_cluster->o2nm_node_group->o2nm_node */ - return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); + if (node->nd_item.ci_parent) + return to_o2nm_cluster(node->nd_item.ci_parent->ci_parent); + else + return NULL; } enum { @@ -194,7 +200,7 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, size_t count) { struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_cluster *cluster; unsigned long tmp; char *p = (char *)page; int ret = 0; @@ -214,6 +220,13 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) return -EINVAL; /* XXX */ + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + o2nm_unlock_subsystem(); + return -EINVAL; + } + write_lock(&cluster->cl_nodes_lock); if (cluster->cl_nodes[tmp]) ret = -EEXIST; @@ -226,6 +239,8 @@ static ssize_t o2nm_node_num_store(struct config_item *item, const char *page, set_bit(tmp, cluster->cl_nodes_bitmap); } write_unlock(&cluster->cl_nodes_lock); + o2nm_unlock_subsystem(); + if (ret) return ret; @@ -269,7 +284,7 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, size_t count) { struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_cluster *cluster; int ret, i; struct rb_node **p, *parent; unsigned int octets[4]; @@ -286,6 +301,13 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, be32_add_cpu(&ipv4_addr, octets[i] << (i * 8)); } + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + o2nm_unlock_subsystem(); + return -EINVAL; + } + ret = 0; write_lock(&cluster->cl_nodes_lock); if (o2nm_node_ip_tree_lookup(cluster, ipv4_addr, &p, &parent)) @@ -298,6 +320,8 @@ static ssize_t o2nm_node_ipv4_address_store(struct config_item *item, rb_insert_color(&node->nd_ip_node, &cluster->cl_node_ip_tree); } write_unlock(&cluster->cl_nodes_lock); + o2nm_unlock_subsystem(); + if (ret) return ret; @@ -315,7 +339,7 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, size_t count) { struct o2nm_node *node = to_o2nm_node(item); - struct o2nm_cluster *cluster = to_o2nm_cluster_from_node(node); + struct o2nm_cluster *cluster; unsigned long tmp; char *p = (char *)page; ssize_t ret; @@ -333,17 +357,26 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, !test_bit(O2NM_NODE_ATTR_PORT, &node->nd_set_attributes)) return -EINVAL; /* XXX */ + o2nm_lock_subsystem(); + cluster = to_o2nm_cluster_from_node(node); + if (!cluster) { + ret = -EINVAL; + goto out; + } + /* the only failure case is trying to set a new local node * when a different one is already set */ if (tmp && tmp == cluster->cl_has_local && - cluster->cl_local_node != node->nd_num) - return -EBUSY; + cluster->cl_local_node != node->nd_num) { + ret = -EBUSY; + goto out; + } /* bring up the rx thread if we're setting the new local node. */ if (tmp && !cluster->cl_has_local) { ret = o2net_start_listening(node); if (ret) - return ret; + goto out; } if (!tmp && cluster->cl_has_local && @@ -358,7 +391,11 @@ static ssize_t o2nm_node_local_store(struct config_item *item, const char *page, cluster->cl_local_node = node->nd_num; } - return count; + ret = count; + +out: + o2nm_unlock_subsystem(); + return ret; } CONFIGFS_ATTR(o2nm_node_, num); @@ -738,6 +775,16 @@ static struct o2nm_cluster_group o2nm_cluster_group = { }, }; +static inline void o2nm_lock_subsystem(void) +{ + mutex_lock(&o2nm_cluster_group.cs_subsys.su_mutex); +} + +static inline void o2nm_unlock_subsystem(void) +{ + mutex_unlock(&o2nm_cluster_group.cs_subsys.su_mutex); +} + int o2nm_depend_item(struct config_item *item) { return configfs_depend_item(&o2nm_cluster_group.cs_subsys, item); From 3db409fa24f6dbeb60211dd829954102845c4e82 Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Wed, 15 Nov 2017 17:31:52 -0800 Subject: [PATCH 011/131] ocfs2/dlm: get mle inuse only when it is initialized When dlm_add_migration_mle returns -EEXIST, previously input mle will not be initialized. So we can't use its associated dlm object. And we truly don't need this mle for already launched migration progress, since oldmle has taken this role. Link: http://lkml.kernel.org/r/63ADC13FD55D6546B7DECE290D39E373CED7AA61@H3CMLB14-EX.srv.huawei-3com.com Signed-off-by: Changwei Ge Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmmaster.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3e04279446e8..9c3e0f13ca87 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2616,7 +2616,9 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, * otherwise the assert_master from the new * master will destroy this. */ - dlm_get_mle_inuse(mle); + if (ret != -EEXIST) + dlm_get_mle_inuse(mle); + spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); From 47ee9d89f04ccce34b6f076258d96d0ba3f5daa9 Mon Sep 17 00:00:00 2001 From: Guozhonghua Date: Wed, 15 Nov 2017 17:31:55 -0800 Subject: [PATCH 012/131] ocfs2: remove unneeded goto in ocfs2_reserve_cluster_bitmap_bits() Link: http://lkml.kernel.org/r/71604351584F6A4EBAE558C676F37CA4F3CDE3A9@H3CMLB14-EX.srv.huawei-3com.com Signed-off-by: guozhonghua Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/suballoc.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 71f22c8fbffd..9f0b95abc09f 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -1147,12 +1147,9 @@ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, GLOBAL_BITMAP_SYSTEM_INODE, OCFS2_INVALID_SLOT, NULL, ALLOC_NEW_GROUP); - if (status < 0 && status != -ENOSPC) { + if (status < 0 && status != -ENOSPC) mlog_errno(status); - goto bail; - } -bail: return status; } From 7ad3f188aac15772c97523dc4ca3e8e5b6294b9c Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 15 Nov 2017 17:31:59 -0800 Subject: [PATCH 013/131] tools: slabinfo: add "-U" option to show unreclaimable slabs only Patch series "oom: capture unreclaimable slab info in oom message", v10. Recently we ran into a oom issue, kernel panic due to no killable process. The dmesg shows huge unreclaimable slabs used almost 100% memory, but kdump doesn't capture vmcore due to some reason. So, it may sound better to capture unreclaimable slab info in oom message when kernel panic to aid trouble shooting and cover the corner case. Since kernel already panic, so capturing more information sounds worthy and doesn't bother normal oom killer. With the patchset, tools/vm/slabinfo has a new option, "-U", to show unreclaimable slab only. And, oom will print all non zero (num_objs * size != 0) unreclaimable slabs in oom killer message. This patch (of 3): Add "-U" option to show unreclaimable slabs only. "-U" and "-S" together can tell us what unreclaimable slabs use the most memory to help debug huge unreclaimable slabs issue. Link: http://lkml.kernel.org/r/1507152550-46205-2-git-send-email-yang.s@alibaba-inc.com Signed-off-by: Yang Shi Acked-by: Christoph Lameter Acked-by: David Rientjes Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/slabinfo.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index b0b7ef6d0de1..f82c2eaa859d 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -84,6 +84,7 @@ int output_lines = -1; int sort_loss; int extended_totals; int show_bytes; +int unreclaim_only; /* Debug options */ int sanity; @@ -133,6 +134,7 @@ static void usage(void) "-L|--Loss Sort by loss\n" "-X|--Xtotals Show extended summary information\n" "-B|--Bytes Show size in bytes\n" + "-U|--Unreclaim Show unreclaimable slabs only\n" "\nValid debug options (FZPUT may be combined)\n" "a / A Switch on all debug options (=FZUP)\n" "- Switch off all debug options\n" @@ -569,6 +571,9 @@ static void slabcache(struct slabinfo *s) if (strcmp(s->name, "*") == 0) return; + if (unreclaim_only && s->reclaim_account) + return; + if (actual_slabs == 1) { report(s); return; @@ -1347,6 +1352,7 @@ struct option opts[] = { { "Loss", no_argument, NULL, 'L'}, { "Xtotals", no_argument, NULL, 'X'}, { "Bytes", no_argument, NULL, 'B'}, + { "Unreclaim", no_argument, NULL, 'U'}, { NULL, 0, NULL, 0 } }; @@ -1358,7 +1364,7 @@ int main(int argc, char *argv[]) page_size = getpagesize(); - while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:LXB", + while ((c = getopt_long(argc, argv, "aAd::Defhil1noprstvzTSN:LXBU", opts, NULL)) != -1) switch (c) { case '1': @@ -1439,6 +1445,9 @@ int main(int argc, char *argv[]) case 'B': show_bytes = 1; break; + case 'U': + unreclaim_only = 1; + break; default: fatal("%s: Invalid option '%c'\n", argv[0], optopt); From 5b36577109be007a6ecf4b65b54cbc9118463c2b Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 15 Nov 2017 17:32:03 -0800 Subject: [PATCH 014/131] mm: slabinfo: remove CONFIG_SLABINFO According to discussion with Christoph (https://marc.info/?l=linux-kernel&m=150695909709711&w=2), it sounds like it is pointless to keep CONFIG_SLABINFO around. This patch removes the CONFIG_SLABINFO config option, but /proc/slabinfo is still available. [yang.s@alibaba-inc.com: v11] Link: http://lkml.kernel.org/r/1507656303-103845-3-git-send-email-yang.s@alibaba-inc.com Link: http://lkml.kernel.org/r/1507152550-46205-3-git-send-email-yang.s@alibaba-inc.com Signed-off-by: Yang Shi Acked-by: David Rientjes Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Michal Hocko Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- init/Kconfig | 6 ------ mm/memcontrol.c | 2 +- mm/slab.c | 2 -- mm/slab_common.c | 7 +++---- mm/slub.c | 4 ++-- 5 files changed, 6 insertions(+), 15 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index 2431e0504479..9b8c187873fe 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1664,12 +1664,6 @@ config HAVE_GENERIC_DMA_COHERENT bool default n -config SLABINFO - bool - depends on PROC_FS - depends on SLAB || SLUB_DEBUG - default y - config RT_MUTEXES bool diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 661f046ad318..50e6906314f8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4049,7 +4049,7 @@ static struct cftype mem_cgroup_legacy_files[] = { .write = mem_cgroup_reset, .read_u64 = mem_cgroup_read_u64, }, -#ifdef CONFIG_SLABINFO +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) { .name = "kmem.slabinfo", .seq_start = memcg_slab_start, diff --git a/mm/slab.c b/mm/slab.c index b7095884fd93..1a6797eec828 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -4097,7 +4097,6 @@ out: schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_AC)); } -#ifdef CONFIG_SLABINFO void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { unsigned long active_objs, num_objs, active_slabs; @@ -4405,7 +4404,6 @@ static int __init slab_proc_init(void) return 0; } module_init(slab_proc_init); -#endif #ifdef CONFIG_HARDENED_USERCOPY /* diff --git a/mm/slab_common.c b/mm/slab_common.c index 0d7fe71ff5e4..9357353bcb64 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1184,8 +1184,7 @@ void cache_random_seq_destroy(struct kmem_cache *cachep) } #endif /* CONFIG_SLAB_FREELIST_RANDOM */ -#ifdef CONFIG_SLABINFO - +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) #ifdef CONFIG_SLAB #define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR) #else @@ -1281,7 +1280,7 @@ static int slab_show(struct seq_file *m, void *p) return 0; } -#if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) +#if defined(CONFIG_MEMCG) void *memcg_slab_start(struct seq_file *m, loff_t *pos) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); @@ -1355,7 +1354,7 @@ static int __init slab_proc_init(void) return 0; } module_init(slab_proc_init); -#endif /* CONFIG_SLABINFO */ +#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */ static __always_inline void *__do_krealloc(const void *p, size_t new_size, gfp_t flags) diff --git a/mm/slub.c b/mm/slub.c index 1efbb8123037..025bbb540f3d 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5852,7 +5852,7 @@ __initcall(slab_sysfs_init); /* * The /proc/slabinfo ABI */ -#ifdef CONFIG_SLABINFO +#ifdef CONFIG_SLUB_DEBUG void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) { unsigned long nr_slabs = 0; @@ -5884,4 +5884,4 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer, { return -EIO; } -#endif /* CONFIG_SLABINFO */ +#endif /* CONFIG_SLUB_DEBUG */ From 852d8be0ad8511611eff18f28dce11d25195b654 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 15 Nov 2017 17:32:07 -0800 Subject: [PATCH 015/131] mm: oom: show unreclaimable slab info when unreclaimable slabs > user memory The kernel may panic when an oom happens without killable process sometimes it is caused by huge unreclaimable slabs used by kernel. Although kdump could help debug such problem, however, kdump is not available on all architectures and it might be malfunction sometime. And, since kernel already panic it is worthy capturing such information in dmesg to aid touble shooting. Print out unreclaimable slab info (used size and total size) which actual memory usage is not zero (num_objs * size != 0) when unreclaimable slabs amount is greater than total user memory (LRU pages). The output looks like: Unreclaimable slab info: Name Used Total rpc_buffers 31KB 31KB rpc_tasks 7KB 7KB ebitmap_node 1964KB 1964KB avtab_node 5024KB 5024KB xfs_buf 1402KB 1402KB xfs_ili 134KB 134KB xfs_efi_item 115KB 115KB xfs_efd_item 115KB 115KB xfs_buf_item 134KB 134KB xfs_log_item_desc 342KB 342KB xfs_trans 1412KB 1412KB xfs_ifork 212KB 212KB [yang.s@alibaba-inc.com: v11] Link: http://lkml.kernel.org/r/1507656303-103845-4-git-send-email-yang.s@alibaba-inc.com Link: http://lkml.kernel.org/r/1507152550-46205-4-git-send-email-yang.s@alibaba-inc.com Signed-off-by: Yang Shi Acked-by: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 27 +++++++++++++++++++++++++-- mm/slab.h | 8 ++++++++ mm/slab_common.c | 34 ++++++++++++++++++++++++++++++++++ 3 files changed, 67 insertions(+), 2 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index dee0f75c3013..3023919970f7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,6 +44,7 @@ #include #include "internal.h" +#include "slab.h" #define CREATE_TRACE_POINTS #include @@ -161,6 +162,25 @@ static bool oom_unkillable_task(struct task_struct *p, return false; } +/* + * Print out unreclaimble slabs info when unreclaimable slabs amount is greater + * than all user memory (LRU pages) + */ +static bool is_dump_unreclaim_slabs(void) +{ + unsigned long nr_lru; + + nr_lru = global_node_page_state(NR_ACTIVE_ANON) + + global_node_page_state(NR_INACTIVE_ANON) + + global_node_page_state(NR_ACTIVE_FILE) + + global_node_page_state(NR_INACTIVE_FILE) + + global_node_page_state(NR_ISOLATED_ANON) + + global_node_page_state(NR_ISOLATED_FILE) + + global_node_page_state(NR_UNEVICTABLE); + + return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru); +} + /** * oom_badness - heuristic function to determine which candidate task to kill * @p: task struct of which task we should calculate @@ -420,10 +440,13 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) cpuset_print_current_mems_allowed(); dump_stack(); - if (oc->memcg) + if (is_memcg_oom(oc)) mem_cgroup_print_oom_info(oc->memcg, p); - else + else { show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); + if (is_dump_unreclaim_slabs()) + dump_unreclaimable_slab(); + } if (sysctl_oom_dump_tasks) dump_tasks(oc->memcg, oc->nodemask); } diff --git a/mm/slab.h b/mm/slab.h index 86d7c7d860f9..45c586cefc11 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -506,6 +506,14 @@ void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos); void memcg_slab_stop(struct seq_file *m, void *p); int memcg_slab_show(struct seq_file *m, void *p); +#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG) +void dump_unreclaimable_slab(void); +#else +static inline void dump_unreclaimable_slab(void) +{ +} +#endif + void ___cache_free(struct kmem_cache *cache, void *x, unsigned long addr); #ifdef CONFIG_SLAB_FREELIST_RANDOM diff --git a/mm/slab_common.c b/mm/slab_common.c index 9357353bcb64..8f7f9f75d7ea 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1280,6 +1280,40 @@ static int slab_show(struct seq_file *m, void *p) return 0; } +void dump_unreclaimable_slab(void) +{ + struct kmem_cache *s, *s2; + struct slabinfo sinfo; + + /* + * Here acquiring slab_mutex is risky since we don't prefer to get + * sleep in oom path. But, without mutex hold, it may introduce a + * risk of crash. + * Use mutex_trylock to protect the list traverse, dump nothing + * without acquiring the mutex. + */ + if (!mutex_trylock(&slab_mutex)) { + pr_warn("excessive unreclaimable slab but cannot dump stats\n"); + return; + } + + pr_info("Unreclaimable slab info:\n"); + pr_info("Name Used Total\n"); + + list_for_each_entry_safe(s, s2, &slab_caches, list) { + if (!is_root_cache(s) || (s->flags & SLAB_RECLAIM_ACCOUNT)) + continue; + + get_slabinfo(s, &sinfo); + + if (sinfo.num_objs > 0) + pr_info("%-17s %10luKB %10luKB\n", cache_name(s), + (sinfo.active_objs * s->size) / 1024, + (sinfo.num_objs * s->size) / 1024); + } + mutex_unlock(&slab_mutex); +} + #if defined(CONFIG_MEMCG) void *memcg_slab_start(struct seq_file *m, loff_t *pos) { From 9f88faee3ff7d6e8b09c9d23b7d4ac0c15a3eae9 Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Wed, 15 Nov 2017 17:32:10 -0800 Subject: [PATCH 016/131] mm/slob.c: remove an unnecessary check for __GFP_ZERO Current flow guarantees a valid pointer when handling the __GFP_ZERO case. So remove the unnecessary NULL pointer check. Link: http://lkml.kernel.org/r/1507203141-11959-1-git-send-email-miles.chen@mediatek.com Signed-off-by: Miles Chen Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slob.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slob.c b/mm/slob.c index 10249160b693..3451ecad8e35 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -330,7 +330,7 @@ static void *slob_alloc(size_t size, gfp_t gfp, int align, int node) BUG_ON(!b); spin_unlock_irqrestore(&slob_lock, flags); } - if (unlikely((gfp & __GFP_ZERO) && b)) + if (unlikely(gfp & __GFP_ZERO)) memset(b, 0, size); return b; } From a3ba074447824625d3a267a5fffd2ea21556ebf4 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 15 Nov 2017 17:32:14 -0800 Subject: [PATCH 017/131] mm/slab.c: only set __GFP_RECLAIMABLE once SLAB_RECLAIM_ACCOUNT is a permanent attribute of a slab cache. Set __GFP_RECLAIMABLE as part of its ->allocflags rather than check the cachep flag on every page allocation. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1710171527560.140898@chino.kir.corp.google.com Signed-off-by: David Rientjes Acked-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 1a6797eec828..0c6468c07b01 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1410,8 +1410,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nr_pages; flags |= cachep->allocflags; - if (cachep->flags & SLAB_RECLAIM_ACCOUNT) - flags |= __GFP_RECLAIMABLE; page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) { @@ -2144,6 +2142,8 @@ done: cachep->allocflags = __GFP_COMP; if (flags & SLAB_CACHE_DMA) cachep->allocflags |= GFP_DMA; + if (flags & SLAB_RECLAIM_ACCOUNT) + cachep->allocflags |= __GFP_RECLAIMABLE; cachep->size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); From d50112edde1d0c621520e53747044009f11c656b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 15 Nov 2017 17:32:18 -0800 Subject: [PATCH 018/131] slab, slub, slob: add slab_flags_t Add sparse-checked slab_flags_t for struct kmem_cache::flags (SLAB_POISON, etc). SLAB is bloated temporarily by switching to "unsigned long", but only temporarily. Link: http://lkml.kernel.org/r/20171021100225.GA22428@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Pekka Enberg Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/main.c | 2 +- fs/xfs/kmem.h | 2 +- include/linux/kasan.h | 4 +-- include/linux/kmemleak.h | 8 +++--- include/linux/slab.h | 60 +++++++++++++++++++++++++--------------- include/linux/slab_def.h | 2 +- include/linux/slub_def.h | 2 +- include/linux/types.h | 1 + include/net/sock.h | 2 +- mm/kasan/kasan.c | 2 +- mm/slab.c | 23 ++++++++------- mm/slab.h | 26 ++++++++--------- mm/slab_common.c | 16 +++++------ mm/slob.c | 2 +- mm/slub.c | 26 +++++++++-------- 15 files changed, 97 insertions(+), 81 deletions(-) diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 6b801186baa5..25aeaa7328ba 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -660,7 +660,7 @@ static struct ecryptfs_cache_info { struct kmem_cache **cache; const char *name; size_t size; - unsigned long flags; + slab_flags_t flags; void (*ctor)(void *obj); } ecryptfs_cache_infos[] = { { diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index 758f37ac5ad3..4b87472f35bc 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -104,7 +104,7 @@ kmem_zone_init(int size, char *zone_name) } static inline kmem_zone_t * -kmem_zone_init_flags(int size, char *zone_name, unsigned long flags, +kmem_zone_init_flags(int size, char *zone_name, slab_flags_t flags, void (*construct)(void *)) { return kmem_cache_create(zone_name, size, 0, flags, construct); diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5017269e3f04..e3eb834c9a35 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -46,7 +46,7 @@ void kasan_alloc_pages(struct page *page, unsigned int order); void kasan_free_pages(struct page *page, unsigned int order); void kasan_cache_create(struct kmem_cache *cache, size_t *size, - unsigned long *flags); + slab_flags_t *flags); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); @@ -95,7 +95,7 @@ static inline void kasan_free_pages(struct page *page, unsigned int order) {} static inline void kasan_cache_create(struct kmem_cache *cache, size_t *size, - unsigned long *flags) {} + slab_flags_t *flags) {} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 590343f6c1b1..5ac416e2d339 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -48,14 +48,14 @@ extern void kmemleak_not_leak_phys(phys_addr_t phys) __ref; extern void kmemleak_ignore_phys(phys_addr_t phys) __ref; static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, - int min_count, unsigned long flags, + int min_count, slab_flags_t flags, gfp_t gfp) { if (!(flags & SLAB_NOLEAKTRACE)) kmemleak_alloc(ptr, size, min_count, gfp); } -static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags) +static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags) { if (!(flags & SLAB_NOLEAKTRACE)) kmemleak_free(ptr); @@ -76,7 +76,7 @@ static inline void kmemleak_alloc(const void *ptr, size_t size, int min_count, { } static inline void kmemleak_alloc_recursive(const void *ptr, size_t size, - int min_count, unsigned long flags, + int min_count, slab_flags_t flags, gfp_t gfp) { } @@ -94,7 +94,7 @@ static inline void kmemleak_free(const void *ptr) static inline void kmemleak_free_part(const void *ptr, size_t size) { } -static inline void kmemleak_free_recursive(const void *ptr, unsigned long flags) +static inline void kmemleak_free_recursive(const void *ptr, slab_flags_t flags) { } static inline void kmemleak_free_percpu(const void __percpu *ptr) diff --git a/include/linux/slab.h b/include/linux/slab.h index af5aa65c7c18..0c4c579f52ed 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -21,13 +21,20 @@ * Flags to pass to kmem_cache_create(). * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. */ -#define SLAB_CONSISTENCY_CHECKS 0x00000100UL /* DEBUG: Perform (expensive) checks on alloc/free */ -#define SLAB_RED_ZONE 0x00000400UL /* DEBUG: Red zone objs in a cache */ -#define SLAB_POISON 0x00000800UL /* DEBUG: Poison objects */ -#define SLAB_HWCACHE_ALIGN 0x00002000UL /* Align objs on cache lines */ -#define SLAB_CACHE_DMA 0x00004000UL /* Use GFP_DMA memory */ -#define SLAB_STORE_USER 0x00010000UL /* DEBUG: Store the last owner for bug hunting */ -#define SLAB_PANIC 0x00040000UL /* Panic if kmem_cache_create() fails */ +/* DEBUG: Perform (expensive) checks on alloc/free */ +#define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100UL) +/* DEBUG: Red zone objs in a cache */ +#define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400UL) +/* DEBUG: Poison objects */ +#define SLAB_POISON ((slab_flags_t __force)0x00000800UL) +/* Align objs on cache lines */ +#define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000UL) +/* Use GFP_DMA memory */ +#define SLAB_CACHE_DMA ((slab_flags_t __force)0x00004000UL) +/* DEBUG: Store the last owner for bug hunting */ +#define SLAB_STORE_USER ((slab_flags_t __force)0x00010000UL) +/* Panic if kmem_cache_create() fails */ +#define SLAB_PANIC ((slab_flags_t __force)0x00040000UL) /* * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * @@ -65,44 +72,51 @@ * * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ -#define SLAB_TYPESAFE_BY_RCU 0x00080000UL /* Defer freeing slabs to RCU */ -#define SLAB_MEM_SPREAD 0x00100000UL /* Spread some memory over cpuset */ -#define SLAB_TRACE 0x00200000UL /* Trace allocations and frees */ +/* Defer freeing slabs to RCU */ +#define SLAB_TYPESAFE_BY_RCU ((slab_flags_t __force)0x00080000UL) +/* Spread some memory over cpuset */ +#define SLAB_MEM_SPREAD ((slab_flags_t __force)0x00100000UL) +/* Trace allocations and frees */ +#define SLAB_TRACE ((slab_flags_t __force)0x00200000UL) /* Flag to prevent checks on free */ #ifdef CONFIG_DEBUG_OBJECTS -# define SLAB_DEBUG_OBJECTS 0x00400000UL +# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00400000UL) #else -# define SLAB_DEBUG_OBJECTS 0x00000000UL +# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00000000UL) #endif -#define SLAB_NOLEAKTRACE 0x00800000UL /* Avoid kmemleak tracing */ +/* Avoid kmemleak tracing */ +#define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000UL) /* Don't track use of uninitialized memory */ #ifdef CONFIG_KMEMCHECK -# define SLAB_NOTRACK 0x01000000UL +# define SLAB_NOTRACK ((slab_flags_t __force)0x01000000UL) #else -# define SLAB_NOTRACK 0x00000000UL +# define SLAB_NOTRACK ((slab_flags_t __force)0x00000000UL) #endif +/* Fault injection mark */ #ifdef CONFIG_FAILSLAB -# define SLAB_FAILSLAB 0x02000000UL /* Fault injection mark */ +# define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000UL) #else -# define SLAB_FAILSLAB 0x00000000UL +# define SLAB_FAILSLAB ((slab_flags_t __force)0x00000000UL) #endif +/* Account to memcg */ #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) -# define SLAB_ACCOUNT 0x04000000UL /* Account to memcg */ +# define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000UL) #else -# define SLAB_ACCOUNT 0x00000000UL +# define SLAB_ACCOUNT ((slab_flags_t __force)0x00000000UL) #endif #ifdef CONFIG_KASAN -#define SLAB_KASAN 0x08000000UL +#define SLAB_KASAN ((slab_flags_t __force)0x08000000UL) #else -#define SLAB_KASAN 0x00000000UL +#define SLAB_KASAN ((slab_flags_t __force)0x00000000UL) #endif /* The following flags affect the page allocator grouping pages by mobility */ -#define SLAB_RECLAIM_ACCOUNT 0x00020000UL /* Objects are reclaimable */ +/* Objects are reclaimable */ +#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000UL) #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. @@ -128,7 +142,7 @@ void __init kmem_cache_init(void); bool slab_is_available(void); struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, - unsigned long, + slab_flags_t, void (*)(void *)); void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 8f7d2b1656d2..072e46e9e1d5 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -20,7 +20,7 @@ struct kmem_cache { struct reciprocal_value reciprocal_buffer_size; /* 2) touched by every alloc & free from the backend */ - unsigned int flags; /* constant flags */ + slab_flags_t flags; /* constant flags */ unsigned int num; /* # of objs per slab */ /* 3) cache_grow/shrink */ diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 39fa09bcde23..0adae162dc8f 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -82,7 +82,7 @@ struct kmem_cache_order_objects { struct kmem_cache { struct kmem_cache_cpu __percpu *cpu_slab; /* Used for retriving partial slabs etc */ - unsigned long flags; + slab_flags_t flags; unsigned long min_partial; int size; /* The size of an object including meta data */ int object_size; /* The size of an object without meta data */ diff --git a/include/linux/types.h b/include/linux/types.h index 34fce54e4f1b..732b52c2eae4 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -156,6 +156,7 @@ typedef u32 dma_addr_t; #endif typedef unsigned __bitwise gfp_t; +typedef unsigned long __bitwise slab_flags_t; typedef unsigned __bitwise fmode_t; #ifdef CONFIG_PHYS_ADDR_T_64BIT diff --git a/include/net/sock.h b/include/net/sock.h index a6b9a8d1a6df..c577286dbffb 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1105,7 +1105,7 @@ struct proto { struct kmem_cache *slab; unsigned int obj_size; - int slab_flags; + slab_flags_t slab_flags; struct percpu_counter *orphan_count; diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 6f319fb81718..405bba487df5 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -337,7 +337,7 @@ static size_t optimal_redzone(size_t object_size) } void kasan_cache_create(struct kmem_cache *cache, size_t *size, - unsigned long *flags) + slab_flags_t *flags) { int redzone_adjust; int orig_size = *size; diff --git a/mm/slab.c b/mm/slab.c index 0c6468c07b01..19b1b9f99819 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -252,8 +252,8 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ } while (0) -#define CFLGS_OBJFREELIST_SLAB (0x40000000UL) -#define CFLGS_OFF_SLAB (0x80000000UL) +#define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000UL) +#define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000UL) #define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) @@ -441,7 +441,7 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) * Calculate the number of objects and left-over bytes for a given buffer size. */ static unsigned int cache_estimate(unsigned long gfporder, size_t buffer_size, - unsigned long flags, size_t *left_over) + slab_flags_t flags, size_t *left_over) { unsigned int num; size_t slab_size = PAGE_SIZE << gfporder; @@ -1759,7 +1759,7 @@ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) * towards high-order requests, this should be changed. */ static size_t calculate_slab_order(struct kmem_cache *cachep, - size_t size, unsigned long flags) + size_t size, slab_flags_t flags) { size_t left_over = 0; int gfporder; @@ -1886,8 +1886,8 @@ static int __ref setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) return 0; } -unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)) { return flags; @@ -1895,7 +1895,7 @@ unsigned long kmem_cache_flags(unsigned long object_size, struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) + slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *cachep; @@ -1913,7 +1913,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, } static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, - size_t size, unsigned long flags) + size_t size, slab_flags_t flags) { size_t left; @@ -1936,7 +1936,7 @@ static bool set_objfreelist_slab_cache(struct kmem_cache *cachep, } static bool set_off_slab_cache(struct kmem_cache *cachep, - size_t size, unsigned long flags) + size_t size, slab_flags_t flags) { size_t left; @@ -1970,7 +1970,7 @@ static bool set_off_slab_cache(struct kmem_cache *cachep, } static bool set_on_slab_cache(struct kmem_cache *cachep, - size_t size, unsigned long flags) + size_t size, slab_flags_t flags) { size_t left; @@ -2006,8 +2006,7 @@ static bool set_on_slab_cache(struct kmem_cache *cachep, * cacheline. This can be beneficial if you're counting cycles as closely * as davem. */ -int -__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) +int __kmem_cache_create(struct kmem_cache *cachep, slab_flags_t flags) { size_t ralign = BYTES_PER_WORD; gfp_t gfp; diff --git a/mm/slab.h b/mm/slab.h index 45c586cefc11..e19255638cb6 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -21,7 +21,7 @@ struct kmem_cache { unsigned int object_size;/* The original size of the object */ unsigned int size; /* The aligned/padded/added on size */ unsigned int align; /* Alignment as calculated */ - unsigned long flags; /* Active flags on the slab */ + slab_flags_t flags; /* Active flags on the slab */ const char *name; /* Slab name for sysfs */ int refcount; /* Use counter */ void (*ctor)(void *); /* Called on object slot creation */ @@ -79,13 +79,13 @@ extern const struct kmalloc_info_struct { unsigned long size; } kmalloc_info[]; -unsigned long calculate_alignment(unsigned long flags, +unsigned long calculate_alignment(slab_flags_t flags, unsigned long align, unsigned long size); #ifndef CONFIG_SLOB /* Kmalloc array related functions */ void setup_kmalloc_cache_index_table(void); -void create_kmalloc_caches(unsigned long); +void create_kmalloc_caches(slab_flags_t); /* Find the kmalloc slab corresponding for a certain size */ struct kmem_cache *kmalloc_slab(size_t, gfp_t); @@ -93,32 +93,32 @@ struct kmem_cache *kmalloc_slab(size_t, gfp_t); /* Functions provided by the slab allocators */ -extern int __kmem_cache_create(struct kmem_cache *, unsigned long flags); +int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags); extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size, - unsigned long flags); + slab_flags_t flags); extern void create_boot_cache(struct kmem_cache *, const char *name, - size_t size, unsigned long flags); + size_t size, slab_flags_t flags); int slab_unmergeable(struct kmem_cache *s); struct kmem_cache *find_mergeable(size_t size, size_t align, - unsigned long flags, const char *name, void (*ctor)(void *)); + slab_flags_t flags, const char *name, void (*ctor)(void *)); #ifndef CONFIG_SLOB struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)); + slab_flags_t flags, void (*ctor)(void *)); -unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)); #else static inline struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) + slab_flags_t flags, void (*ctor)(void *)) { return NULL; } -static inline unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +static inline slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)) { return flags; diff --git a/mm/slab_common.c b/mm/slab_common.c index 8f7f9f75d7ea..175e86637afd 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -291,7 +291,7 @@ int slab_unmergeable(struct kmem_cache *s) } struct kmem_cache *find_mergeable(size_t size, size_t align, - unsigned long flags, const char *name, void (*ctor)(void *)) + slab_flags_t flags, const char *name, void (*ctor)(void *)) { struct kmem_cache *s; @@ -341,7 +341,7 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, * Figure out what the alignment of the objects will be given a set of * flags, a user specified alignment and the size of the objects. */ -unsigned long calculate_alignment(unsigned long flags, +unsigned long calculate_alignment(slab_flags_t flags, unsigned long align, unsigned long size) { /* @@ -366,7 +366,7 @@ unsigned long calculate_alignment(unsigned long flags, static struct kmem_cache *create_cache(const char *name, size_t object_size, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *), + slab_flags_t flags, void (*ctor)(void *), struct mem_cgroup *memcg, struct kmem_cache *root_cache) { struct kmem_cache *s; @@ -431,7 +431,7 @@ out_free_cache: */ struct kmem_cache * kmem_cache_create(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) + slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *s = NULL; const char *cache_name; @@ -879,7 +879,7 @@ bool slab_is_available(void) #ifndef CONFIG_SLOB /* Create a cache during boot when no slab services are available yet */ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t size, - unsigned long flags) + slab_flags_t flags) { int err; @@ -899,7 +899,7 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz } struct kmem_cache *__init create_kmalloc_cache(const char *name, size_t size, - unsigned long flags) + slab_flags_t flags) { struct kmem_cache *s = kmem_cache_zalloc(kmem_cache, GFP_NOWAIT); @@ -1057,7 +1057,7 @@ void __init setup_kmalloc_cache_index_table(void) } } -static void __init new_kmalloc_cache(int idx, unsigned long flags) +static void __init new_kmalloc_cache(int idx, slab_flags_t flags) { kmalloc_caches[idx] = create_kmalloc_cache(kmalloc_info[idx].name, kmalloc_info[idx].size, flags); @@ -1068,7 +1068,7 @@ static void __init new_kmalloc_cache(int idx, unsigned long flags) * may already have been created because they were needed to * enable allocations for slab creation. */ -void __init create_kmalloc_caches(unsigned long flags) +void __init create_kmalloc_caches(slab_flags_t flags) { int i; diff --git a/mm/slob.c b/mm/slob.c index 3451ecad8e35..623e8a5c46ce 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -524,7 +524,7 @@ size_t ksize(const void *block) } EXPORT_SYMBOL(ksize); -int __kmem_cache_create(struct kmem_cache *c, unsigned long flags) +int __kmem_cache_create(struct kmem_cache *c, slab_flags_t flags) { if (flags & SLAB_TYPESAFE_BY_RCU) { /* leave room for rcu footer at the end of object */ diff --git a/mm/slub.c b/mm/slub.c index 025bbb540f3d..482d1daa9088 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -193,8 +193,10 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) #define MAX_OBJS_PER_PAGE 32767 /* since page.objects is u15 */ /* Internal SLUB flags */ -#define __OBJECT_POISON 0x80000000UL /* Poison object */ -#define __CMPXCHG_DOUBLE 0x40000000UL /* Use cmpxchg_double */ +/* Poison object */ +#define __OBJECT_POISON ((slab_flags_t __force)0x80000000UL) +/* Use cmpxchg_double */ +#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000UL) /* * Tracking user of a slab. @@ -485,9 +487,9 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) * Debug settings: */ #if defined(CONFIG_SLUB_DEBUG_ON) -static int slub_debug = DEBUG_DEFAULT_FLAGS; +static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; #else -static int slub_debug; +static slab_flags_t slub_debug; #endif static char *slub_debug_slabs; @@ -1289,8 +1291,8 @@ out: __setup("slub_debug", setup_slub_debug); -unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)) { /* @@ -1322,8 +1324,8 @@ static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page) {} -unsigned long kmem_cache_flags(unsigned long object_size, - unsigned long flags, const char *name, +slab_flags_t kmem_cache_flags(unsigned long object_size, + slab_flags_t flags, const char *name, void (*ctor)(void *)) { return flags; @@ -3477,7 +3479,7 @@ static void set_cpu_partial(struct kmem_cache *s) */ static int calculate_sizes(struct kmem_cache *s, int forced_order) { - unsigned long flags = s->flags; + slab_flags_t flags = s->flags; size_t size = s->object_size; int order; @@ -3593,7 +3595,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) return !!oo_objects(s->oo); } -static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) +static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); s->reserved = 0; @@ -4245,7 +4247,7 @@ void __init kmem_cache_init_late(void) struct kmem_cache * __kmem_cache_alias(const char *name, size_t size, size_t align, - unsigned long flags, void (*ctor)(void *)) + slab_flags_t flags, void (*ctor)(void *)) { struct kmem_cache *s, *c; @@ -4275,7 +4277,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, return s; } -int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) +int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags) { int err; From 4fd0b46e898791009b03b2fdd6510044fa8730a6 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Wed, 15 Nov 2017 17:32:21 -0800 Subject: [PATCH 019/131] slab, slub, slob: convert slab_flags_t to 32-bit struct kmem_cache::flags is "unsigned long" which is unnecessary on 64-bit as no flags are defined in the higher bits. Switch the field to 32-bit and save some space on x86_64 until such flags appear: add/remove: 0/0 grow/shrink: 0/107 up/down: 0/-657 (-657) function old new delta sysfs_slab_add 720 719 -1 ... check_object 699 676 -23 [akpm@linux-foundation.org: fix printk warning] Link: http://lkml.kernel.org/r/20171021100635.GA8287@avx2 Signed-off-by: Alexey Dobriyan Acked-by: Pekka Enberg Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 44 +++++++++++++++++++++---------------------- include/linux/types.h | 2 +- mm/slab.c | 4 ++-- mm/slub.c | 6 +++--- 4 files changed, 28 insertions(+), 28 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index 0c4c579f52ed..f37cb93768ab 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -22,19 +22,19 @@ * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. */ /* DEBUG: Perform (expensive) checks on alloc/free */ -#define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100UL) +#define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100U) /* DEBUG: Red zone objs in a cache */ -#define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400UL) +#define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400U) /* DEBUG: Poison objects */ -#define SLAB_POISON ((slab_flags_t __force)0x00000800UL) +#define SLAB_POISON ((slab_flags_t __force)0x00000800U) /* Align objs on cache lines */ -#define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000UL) +#define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000U) /* Use GFP_DMA memory */ -#define SLAB_CACHE_DMA ((slab_flags_t __force)0x00004000UL) +#define SLAB_CACHE_DMA ((slab_flags_t __force)0x00004000U) /* DEBUG: Store the last owner for bug hunting */ -#define SLAB_STORE_USER ((slab_flags_t __force)0x00010000UL) +#define SLAB_STORE_USER ((slab_flags_t __force)0x00010000U) /* Panic if kmem_cache_create() fails */ -#define SLAB_PANIC ((slab_flags_t __force)0x00040000UL) +#define SLAB_PANIC ((slab_flags_t __force)0x00040000U) /* * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * @@ -73,50 +73,50 @@ * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ /* Defer freeing slabs to RCU */ -#define SLAB_TYPESAFE_BY_RCU ((slab_flags_t __force)0x00080000UL) +#define SLAB_TYPESAFE_BY_RCU ((slab_flags_t __force)0x00080000U) /* Spread some memory over cpuset */ -#define SLAB_MEM_SPREAD ((slab_flags_t __force)0x00100000UL) +#define SLAB_MEM_SPREAD ((slab_flags_t __force)0x00100000U) /* Trace allocations and frees */ -#define SLAB_TRACE ((slab_flags_t __force)0x00200000UL) +#define SLAB_TRACE ((slab_flags_t __force)0x00200000U) /* Flag to prevent checks on free */ #ifdef CONFIG_DEBUG_OBJECTS -# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00400000UL) +# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00400000U) #else -# define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00000000UL) +# define SLAB_DEBUG_OBJECTS 0 #endif /* Avoid kmemleak tracing */ -#define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000UL) +#define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000U) /* Don't track use of uninitialized memory */ #ifdef CONFIG_KMEMCHECK -# define SLAB_NOTRACK ((slab_flags_t __force)0x01000000UL) +# define SLAB_NOTRACK ((slab_flags_t __force)0x01000000U) #else -# define SLAB_NOTRACK ((slab_flags_t __force)0x00000000UL) +# define SLAB_NOTRACK 0 #endif /* Fault injection mark */ #ifdef CONFIG_FAILSLAB -# define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000UL) +# define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000U) #else -# define SLAB_FAILSLAB ((slab_flags_t __force)0x00000000UL) +# define SLAB_FAILSLAB 0 #endif /* Account to memcg */ #if defined(CONFIG_MEMCG) && !defined(CONFIG_SLOB) -# define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000UL) +# define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000U) #else -# define SLAB_ACCOUNT ((slab_flags_t __force)0x00000000UL) +# define SLAB_ACCOUNT 0 #endif #ifdef CONFIG_KASAN -#define SLAB_KASAN ((slab_flags_t __force)0x08000000UL) +#define SLAB_KASAN ((slab_flags_t __force)0x08000000U) #else -#define SLAB_KASAN ((slab_flags_t __force)0x00000000UL) +#define SLAB_KASAN 0 #endif /* The following flags affect the page allocator grouping pages by mobility */ /* Objects are reclaimable */ -#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000UL) +#define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U) #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. diff --git a/include/linux/types.h b/include/linux/types.h index 732b52c2eae4..c94d59ef96cc 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -156,7 +156,7 @@ typedef u32 dma_addr_t; #endif typedef unsigned __bitwise gfp_t; -typedef unsigned long __bitwise slab_flags_t; +typedef unsigned __bitwise slab_flags_t; typedef unsigned __bitwise fmode_t; #ifdef CONFIG_PHYS_ADDR_T_64BIT diff --git a/mm/slab.c b/mm/slab.c index 19b1b9f99819..7a5e0888a401 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -252,8 +252,8 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid); \ } while (0) -#define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000UL) -#define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000UL) +#define CFLGS_OBJFREELIST_SLAB ((slab_flags_t __force)0x40000000U) +#define CFLGS_OFF_SLAB ((slab_flags_t __force)0x80000000U) #define OBJFREELIST_SLAB(x) ((x)->flags & CFLGS_OBJFREELIST_SLAB) #define OFF_SLAB(x) ((x)->flags & CFLGS_OFF_SLAB) diff --git a/mm/slub.c b/mm/slub.c index 482d1daa9088..33957fd376ae 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -194,9 +194,9 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* Internal SLUB flags */ /* Poison object */ -#define __OBJECT_POISON ((slab_flags_t __force)0x80000000UL) +#define __OBJECT_POISON ((slab_flags_t __force)0x80000000U) /* Use cmpxchg_double */ -#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000UL) +#define __CMPXCHG_DOUBLE ((slab_flags_t __force)0x40000000U) /* * Tracking user of a slab. @@ -3657,7 +3657,7 @@ error: if (flags & SLAB_PANIC) panic("Cannot create slab %s size=%lu realsize=%u order=%u offset=%u flags=%lx\n", s->name, (unsigned long)s->size, s->size, - oo_order(s->oo), s->offset, flags); + oo_order(s->oo), s->offset, (unsigned long)flags); return -EINVAL; } From 11066386efa692f77171484c32ea30f6e5a0d729 Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Wed, 15 Nov 2017 17:32:25 -0800 Subject: [PATCH 020/131] slub: fix sysfs duplicate filename creation when slub_debug=O When slub_debug=O is set. It is possible to clear debug flags for an "unmergeable" slab cache in kmem_cache_open(). It makes the "unmergeable" cache became "mergeable" in sysfs_slab_add(). These caches will generate their "unique IDs" by create_unique_id(), but it is possible to create identical unique IDs. In my experiment, sgpool-128, names_cache, biovec-256 generate the same ID ":Ft-0004096" and the kernel reports "sysfs: cannot create duplicate filename '/kernel/slab/:Ft-0004096'". To repeat my experiment, set disable_higher_order_debug=1, CONFIG_SLUB_DEBUG_ON=y in kernel-4.14. Fix this issue by setting unmergeable=1 if slub_debug=O and the the default slub_debug contains any no-merge flags. call path: kmem_cache_create() __kmem_cache_alias() -> we set SLAB_NEVER_MERGE flags here create_cache() __kmem_cache_create() kmem_cache_open() -> clear DEBUG_METADATA_FLAGS sysfs_slab_add() -> the slab cache is mergeable now sysfs: cannot create duplicate filename '/kernel/slab/:Ft-0004096' ------------[ cut here ]------------ WARNING: CPU: 0 PID: 1 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x60/0x7c Modules linked in: CPU: 0 PID: 1 Comm: swapper/0 Tainted: G W 4.14.0-rc7ajb-00131-gd4c2e9f-dirty #123 Hardware name: linux,dummy-virt (DT) task: ffffffc07d4e0080 task.stack: ffffff8008008000 PC is at sysfs_warn_dup+0x60/0x7c LR is at sysfs_warn_dup+0x60/0x7c pc : lr : pstate: 60000145 Call trace: sysfs_warn_dup+0x60/0x7c sysfs_create_dir_ns+0x98/0xa0 kobject_add_internal+0xa0/0x294 kobject_init_and_add+0x90/0xb4 sysfs_slab_add+0x90/0x200 __kmem_cache_create+0x26c/0x438 kmem_cache_create+0x164/0x1f4 sg_pool_init+0x60/0x100 do_one_initcall+0x38/0x12c kernel_init_freeable+0x138/0x1d4 kernel_init+0x10/0xfc ret_from_fork+0x10/0x18 Link: http://lkml.kernel.org/r/1510365805-5155-1-git-send-email-miles.chen@mediatek.com Signed-off-by: Miles Chen Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mm/slub.c b/mm/slub.c index 33957fd376ae..51484f0fc068 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5706,6 +5706,10 @@ static int sysfs_slab_add(struct kmem_cache *s) return 0; } + if (!unmergeable && disable_higher_order_debug && + (slub_debug & DEBUG_METADATA_FLAGS)) + unmergeable = 1; + if (unmergeable) { /* * Slabcache can never be merged so we can use the name proper. From 5799b255c491d853b73fb9e0e1760210315d06cd Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 15 Nov 2017 17:32:29 -0800 Subject: [PATCH 021/131] include/linux/slab.h: add kmalloc_array_node() and kcalloc_node() Patch series "Add kmalloc_array_node() and kcalloc_node()". Our current memeory allocation routines suffer form an API imbalance, for one we have kmalloc_array() and kcalloc() which check for overflows in size multiplication and we have kmalloc_node() and kzalloc_node() which allow for memory allocation on a certain NUMA node but don't check for eventual overflows. This patch (of 6): We have kmalloc_array() and kcalloc() wrappers on top of kmalloc() which ensure us overflow free multiplication for the size of a memory allocation but these implementations are not NUMA-aware. Likewise we have kmalloc_node() which is a NUMA-aware version of kmalloc() but the implementation is not aware of any possible overflows in eventual size calculations. Introduce a combination of the two above cases to have a NUMA-node aware version of kmalloc_array() and kcalloc(). Link: http://lkml.kernel.org/r/20170927082038.3782-2-jthumshirn@suse.de Signed-off-by: Johannes Thumshirn Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Christoph Lameter Cc: Damien Le Moal Cc: David Rientjes Cc: "David S. Miller" Cc: Doug Ledford Cc: Hal Rosenstock Cc: Jens Axboe Cc: Joonsoo Kim Cc: Mike Marciniszyn Cc: Pekka Enberg Cc: Santosh Shilimkar Cc: Sean Hefty Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/include/linux/slab.h b/include/linux/slab.h index f37cb93768ab..ebddfca9e24b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -650,6 +650,22 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); #define kmalloc_track_caller(size, flags) \ __kmalloc_track_caller(size, flags, _RET_IP_) +static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, + int node) +{ + if (size != 0 && n > SIZE_MAX / size) + return NULL; + if (__builtin_constant_p(n) && __builtin_constant_p(size)) + return kmalloc_node(n * size, flags, node); + return __kmalloc_node(n * size, flags, node); +} + +static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) +{ + return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); +} + + #ifdef CONFIG_NUMA extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); #define kmalloc_node_track_caller(size, flags, node) \ From d904bfa79f9ca683d4e51c8d38f79f8be0f70f2a Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 15 Nov 2017 17:32:33 -0800 Subject: [PATCH 022/131] block/blk-mq.c: use kmalloc_array_node() Now that we have a NUMA-aware version of kmalloc_array() we can use it instead of kmalloc_node() without an overflow check in the size calculation. Link: http://lkml.kernel.org/r/20170927082038.3782-3-jthumshirn@suse.de Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Damien Le Moal Cc: Christoph Hellwig Cc: Jens Axboe Cc: "David S. Miller" Cc: Doug Ledford Cc: Hal Rosenstock Cc: Mike Marciniszyn Cc: Santosh Shilimkar Cc: Sean Hefty Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/blk-mq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index b600463791ec..11097477eeab 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -2047,7 +2047,7 @@ static int blk_mq_init_hctx(struct request_queue *q, * Allocate space for all possible cpus to avoid allocation at * runtime */ - hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), + hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), GFP_KERNEL, node); if (!hctx->ctxs) goto unregister_cpu_notifier; From 7d50207163414d38e6a706904bd915db145fb8a0 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 15 Nov 2017 17:32:37 -0800 Subject: [PATCH 023/131] drivers/infiniband/hw/qib/qib_init.c: use kmalloc_array_node() Now that we have a NUMA-aware version of kmalloc_array() we can use it instead of kmalloc_node() without an overflow check in the size calculation. Link: http://lkml.kernel.org/r/20170927082038.3782-4-jthumshirn@suse.de Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Lameter Cc: Mike Marciniszyn Cc: Doug Ledford Cc: Sean Hefty Cc: Hal Rosenstock Cc: Christoph Hellwig Cc: Damien Le Moal Cc: David Rientjes Cc: "David S. Miller" Cc: Jens Axboe Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Santosh Shilimkar Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/hw/qib/qib_init.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/hw/qib/qib_init.c b/drivers/infiniband/hw/qib/qib_init.c index c5a4c65636d6..6aebf4e707b9 100644 --- a/drivers/infiniband/hw/qib/qib_init.c +++ b/drivers/infiniband/hw/qib/qib_init.c @@ -1674,8 +1674,9 @@ int qib_setup_eagerbufs(struct qib_ctxtdata *rcd) } if (!rcd->rcvegrbuf_phys) { rcd->rcvegrbuf_phys = - kmalloc_node(chunk * sizeof(rcd->rcvegrbuf_phys[0]), - GFP_KERNEL, rcd->node_id); + kmalloc_array_node(chunk, + sizeof(rcd->rcvegrbuf_phys[0]), + GFP_KERNEL, rcd->node_id); if (!rcd->rcvegrbuf_phys) goto bail_rcvegrbuf; } From 3c073478414db89ec963631f2aa46c94d4bb33da Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 15 Nov 2017 17:32:41 -0800 Subject: [PATCH 024/131] drivers/infiniband/sw/rdmavt/qp.c: use kmalloc_array_node() Now that we have a NUMA-aware version of kmalloc_array() we can use it instead of kmalloc_node() without an overflow check in the size calculation. Link: http://lkml.kernel.org/r/20170927082038.3782-5-jthumshirn@suse.de Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Lameter Cc: Christoph Hellwig Cc: Damien Le Moal Cc: David Rientjes Cc: "David S. Miller" Cc: Doug Ledford Cc: Hal Rosenstock Cc: Jens Axboe Cc: Joonsoo Kim Cc: Mike Marciniszyn Cc: Pekka Enberg Cc: Santosh Shilimkar Cc: Sean Hefty Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/sw/rdmavt/qp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/sw/rdmavt/qp.c b/drivers/infiniband/sw/rdmavt/qp.c index b670cb9d2006..27a1ad79d654 100644 --- a/drivers/infiniband/sw/rdmavt/qp.c +++ b/drivers/infiniband/sw/rdmavt/qp.c @@ -238,7 +238,7 @@ int rvt_driver_qp_init(struct rvt_dev_info *rdi) rdi->qp_dev->qp_table_size = rdi->dparms.qp_table_size; rdi->qp_dev->qp_table_bits = ilog2(rdi->dparms.qp_table_size); rdi->qp_dev->qp_table = - kmalloc_node(rdi->qp_dev->qp_table_size * + kmalloc_array_node(rdi->qp_dev->qp_table_size, sizeof(*rdi->qp_dev->qp_table), GFP_KERNEL, rdi->dparms.node); if (!rdi->qp_dev->qp_table) From 63762f50548aa27dc4c380638fa6fed43ae72258 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 15 Nov 2017 17:32:45 -0800 Subject: [PATCH 025/131] mm/mempool.c: use kmalloc_array_node() Now that we have a NUMA-aware version of kmalloc_array() we can use it instead of kmalloc_node() without an overflow check in the size calculation. Link: http://lkml.kernel.org/r/20170927082038.3782-6-jthumshirn@suse.de Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Lameter Cc: Christoph Hellwig Cc: Damien Le Moal Cc: David Rientjes Cc: "David S. Miller" Cc: Doug Ledford Cc: Hal Rosenstock Cc: Jens Axboe Cc: Joonsoo Kim Cc: Mike Marciniszyn Cc: Pekka Enberg Cc: Santosh Shilimkar Cc: Sean Hefty Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempool.c b/mm/mempool.c index c4a23cdae3f0..7d8c5a0010a2 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -189,7 +189,7 @@ mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, pool = kzalloc_node(sizeof(*pool), gfp_mask, node_id); if (!pool) return NULL; - pool->elements = kmalloc_node(min_nr * sizeof(void *), + pool->elements = kmalloc_array_node(min_nr, sizeof(void *), gfp_mask, node_id); if (!pool->elements) { kfree(pool); From c413af877ffa8a8c28796c9d156cf40a83e58994 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Wed, 15 Nov 2017 17:32:49 -0800 Subject: [PATCH 026/131] net/rds/ib_fmr.c: use kmalloc_array_node() Now that we have a NUMA-aware version of kmalloc_array() we can use it instead of kmalloc_node() without an overflow check in the size calculation. Link: http://lkml.kernel.org/r/20170927082038.3782-7-jthumshirn@suse.de Signed-off-by: Johannes Thumshirn Reviewed-by: Christoph Lameter Cc: Santosh Shilimkar Cc: "David S. Miller" Cc: Christoph Hellwig Cc: Damien Le Moal Cc: David Rientjes Cc: Doug Ledford Cc: Hal Rosenstock Cc: Jens Axboe Cc: Joonsoo Kim Cc: Mike Marciniszyn Cc: Pekka Enberg Cc: Sean Hefty Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/rds/ib_fmr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c index 86ef907067bb..e0f70c4051b6 100644 --- a/net/rds/ib_fmr.c +++ b/net/rds/ib_fmr.c @@ -139,8 +139,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, return -EINVAL; } - dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, - rdsibdev_to_node(rds_ibdev)); + dma_pages = kmalloc_array_node(sizeof(u64), page_cnt, GFP_ATOMIC, + rdsibdev_to_node(rds_ibdev)); if (!dma_pages) { ib_dma_unmap_sg(dev, sg, nents, DMA_BIDIRECTIONAL); return -ENOMEM; From 41710443f790b5f7f5305eba99dacce88e259f4c Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Wed, 15 Nov 2017 17:32:53 -0800 Subject: [PATCH 027/131] mm: update comments for struct page.mapping struct page.mapping can be NULL or points to one object of type address_space, anon_vma or KSM private structure. Link: http://lkml.kernel.org/r/1506485067-15954-1-git-send-email-changbin.du@intel.com Signed-off-by: Changbin Du Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c85f11dafd56..d1b8e8f97fc2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -48,8 +48,10 @@ struct page { * inode address_space, or NULL. * If page mapped as anonymous * memory, low bit is set, and - * it points to anon_vma object: - * see PAGE_MAPPING_ANON below. + * it points to anon_vma object + * or KSM private structure. See + * PAGE_MAPPING_ANON and + * PAGE_MAPPING_KSM. */ void *s_mem; /* slab first object */ atomic_t compound_mapcount; /* first tail page */ From e447a0151f7ce8dd884fea48279274bd64434c29 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:32:56 -0800 Subject: [PATCH 028/131] zram: set BDI_CAP_STABLE_WRITES once With fast swap storage, the platform wants to use swap more aggressively and swap-in is crucial to application latency. The rw_page() based synchronous devices like zram, pmem and btt are such fast storage. When I profile swapin performance with zram lz4 decompress test, S/W overhead is more than 70%. Maybe, it would be bigger in nvdimm. This patchset reduces swap-in latency by skipping swapcache if the swap device is a synchronous device like a rw_page() based device. It enhances by 45% my swapin test (5G sequential swapin, no readahead) from 2.41sec to 1.64sec. This patch (of 4): Commit 19b7ccf8651d ("block: get rid of blk_integrity_revalidate()") fixed a weird thing (i.e., reset BDI_CAP_STABLE_WRITES flag unconditionally whenever revalidat_disk is called) so zram doesn't need to reset the flag any more when revalidating the bdev. Instead, set the flag just once when the zram device is created. It shouldn't change any behavior. Link: http://lkml.kernel.org/r/1505886205-9671-2-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Ilya Dryomov Cc: Christoph Hellwig Cc: Dan Williams Cc: Ross Zwisler Cc: Jens Axboe Cc: Hugh Dickins Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f149d3e61234..d95bb8ce5092 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -122,14 +122,6 @@ static inline bool is_partial_io(struct bio_vec *bvec) } #endif -static void zram_revalidate_disk(struct zram *zram) -{ - revalidate_disk(zram->disk); - /* revalidate_disk reset the BDI_CAP_STABLE_WRITES so set again */ - zram->disk->queue->backing_dev_info->capabilities |= - BDI_CAP_STABLE_WRITES; -} - /* * Check if request is within bounds and aligned on zram logical blocks. */ @@ -1373,7 +1365,8 @@ static ssize_t disksize_store(struct device *dev, zram->comp = comp; zram->disksize = disksize; set_capacity(zram->disk, zram->disksize >> SECTOR_SHIFT); - zram_revalidate_disk(zram); + + revalidate_disk(zram->disk); up_write(&zram->init_lock); return len; @@ -1420,7 +1413,7 @@ static ssize_t reset_store(struct device *dev, /* Make sure all the pending I/O are finished */ fsync_bdev(bdev); zram_reset_device(zram); - zram_revalidate_disk(zram); + revalidate_disk(zram->disk); bdput(bdev); mutex_lock(&bdev->bd_mutex); @@ -1539,6 +1532,7 @@ static int zram_add(void) /* zram devices sort of resembles non-rotational disks */ queue_flag_set_unlocked(QUEUE_FLAG_NONROT, zram->disk->queue); queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue); + /* * To ensure that we always get PAGE_SIZE aligned * and n*PAGE_SIZED sized I/O requests. @@ -1563,6 +1557,8 @@ static int zram_add(void) if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); + zram->disk->queue->backing_dev_info->capabilities |= + BDI_CAP_STABLE_WRITES; add_disk(zram->disk); ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj, From 23c47d2ada9f96731492a67b28c0072715075baa Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:33:00 -0800 Subject: [PATCH 029/131] bdi: introduce BDI_CAP_SYNCHRONOUS_IO As discussed at https://lkml.kernel.org/r/<20170728165604.10455-1-ross.zwisler@linux.intel.com> someday we will remove rw_page(). If so, we need something to detect such super-fast storage on which synchronous IO operations like the current rw_page are always a win. Introduces BDI_CAP_SYNCHRONOUS_IO to indicate such devices. With it, we could use various optimization techniques. Link: http://lkml.kernel.org/r/1505886205-9671-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Christoph Hellwig Cc: Dan Williams Cc: Ross Zwisler Cc: Hugh Dickins Cc: Ilya Dryomov Cc: Jens Axboe Cc: Sergey Senozhatsky Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/brd.c | 2 ++ drivers/block/zram/zram_drv.c | 2 +- drivers/nvdimm/btt.c | 3 +++ drivers/nvdimm/pmem.c | 2 ++ include/linux/backing-dev.h | 8 ++++++++ 5 files changed, 16 insertions(+), 1 deletion(-) diff --git a/drivers/block/brd.c b/drivers/block/brd.c index c1cf87718c2e..588360d79fca 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -20,6 +20,7 @@ #include #include #include +#include #ifdef CONFIG_BLK_DEV_RAM_DAX #include #include @@ -448,6 +449,7 @@ static struct brd_device *brd_alloc(int i) disk->flags = GENHD_FL_EXT_DEVT; sprintf(disk->disk_name, "ram%d", i); set_capacity(disk, rd_size * 2); + disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; #ifdef CONFIG_BLK_DEV_RAM_DAX queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d95bb8ce5092..88564222f473 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1558,7 +1558,7 @@ static int zram_add(void) blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); zram->disk->queue->backing_dev_info->capabilities |= - BDI_CAP_STABLE_WRITES; + (BDI_CAP_STABLE_WRITES | BDI_CAP_SYNCHRONOUS_IO); add_disk(zram->disk); ret = sysfs_create_group(&disk_to_dev(zram->disk)->kobj, diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index d5612bd1cc81..e949e3302af4 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "btt.h" #include "nd.h" @@ -1402,6 +1403,8 @@ static int btt_blk_init(struct btt *btt) btt->btt_disk->private_data = btt; btt->btt_disk->queue = btt->btt_queue; btt->btt_disk->flags = GENHD_FL_EXT_DEVT; + btt->btt_disk->queue->backing_dev_info->capabilities |= + BDI_CAP_SYNCHRONOUS_IO; blk_queue_make_request(btt->btt_queue, btt_make_request); blk_queue_logical_block_size(btt->btt_queue, btt->sector_size); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 39dfd7affa31..7fbc5c5dc8e1 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -31,6 +31,7 @@ #include #include #include +#include #include "pmem.h" #include "pfn.h" #include "nd.h" @@ -394,6 +395,7 @@ static int pmem_attach_disk(struct device *dev, disk->fops = &pmem_fops; disk->queue = q; disk->flags = GENHD_FL_EXT_DEVT; + disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; nvdimm_namespace_disk_name(ndns, disk->disk_name); set_capacity(disk, (pmem->size - pmem->pfn_pad - pmem->data_offset) / 512); diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index f41ca8486e02..7360e79e9a2f 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -122,6 +122,8 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); * BDI_CAP_STRICTLIMIT: Keep number of dirty pages below bdi threshold. * * BDI_CAP_CGROUP_WRITEBACK: Supports cgroup-aware writeback. + * BDI_CAP_SYNCHRONOUS_IO: Device is so fast that asynchronous IO would be + * inefficient. */ #define BDI_CAP_NO_ACCT_DIRTY 0x00000001 #define BDI_CAP_NO_WRITEBACK 0x00000002 @@ -129,6 +131,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #define BDI_CAP_STABLE_WRITES 0x00000008 #define BDI_CAP_STRICTLIMIT 0x00000010 #define BDI_CAP_CGROUP_WRITEBACK 0x00000020 +#define BDI_CAP_SYNCHRONOUS_IO 0x00000040 #define BDI_CAP_NO_ACCT_AND_WRITEBACK \ (BDI_CAP_NO_WRITEBACK | BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_ACCT_WB) @@ -174,6 +177,11 @@ static inline int wb_congested(struct bdi_writeback *wb, int cong_bits) long congestion_wait(int sync, long timeout); long wait_iff_congested(struct pglist_data *pgdat, int sync, long timeout); +static inline bool bdi_cap_synchronous_io(struct backing_dev_info *bdi) +{ + return bdi->capabilities & BDI_CAP_SYNCHRONOUS_IO; +} + static inline bool bdi_cap_stable_pages_required(struct backing_dev_info *bdi) { return bdi->capabilities & BDI_CAP_STABLE_WRITES; From 539a6fea7fdcade532bd3e77be2862a683f8f0c9 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:33:04 -0800 Subject: [PATCH 030/131] mm, swap: introduce SWP_SYNCHRONOUS_IO If rw-page based fast storage is used for swap devices, we need to detect it to enhance swap IO operations. This patch is preparation for optimizing of swap-in operation with next patch. Link: http://lkml.kernel.org/r/1505886205-9671-4-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Hugh Dickins Cc: Christoph Hellwig Cc: Dan Williams Cc: Ilya Dryomov Cc: Jens Axboe Cc: Ross Zwisler Cc: Sergey Senozhatsky Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 ++- mm/swapfile.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index f02fb5db8914..933d7c0c3542 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -171,8 +171,9 @@ enum { SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ + SWP_SYNCHRONOUS_IO = (1 << 11), /* synchronous IO is efficient */ /* add others here before... */ - SWP_SCANNING = (1 << 11), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 12), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL diff --git a/mm/swapfile.c b/mm/swapfile.c index e47a21e64764..cb08fa65819f 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3169,6 +3169,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) p->flags |= SWP_STABLE_WRITES; + if (bdi_cap_synchronous_io(inode_to_bdi(inode))) + p->flags |= SWP_SYNCHRONOUS_IO; + if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { int cpu; unsigned long ci, nr_cluster; From 0bcac06f27d7528591c27ac2b093ccd71c5d0168 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:33:07 -0800 Subject: [PATCH 031/131] mm, swap: skip swapcache for swapin of synchronous device With fast swap storage, the platforms want to use swap more aggressively and swap-in is crucial to application latency. The rw_page() based synchronous devices like zram, pmem and btt are such fast storage. When I profile swapin performance with zram lz4 decompress test, S/W overhead is more than 70%. Maybe, it would be bigger in nvdimm. This patch aims to reduce swap-in latency by skipping swapcache if the swap device is synchronous device like rw_page based device. It enhances 45% my swapin test(5G sequential swapin, no readahead, from 2.41sec to 1.64sec). Link: http://lkml.kernel.org/r/1505886205-9671-5-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Dan Williams Cc: Ross Zwisler Cc: Hugh Dickins Cc: Christoph Hellwig Cc: Ilya Dryomov Cc: Jens Axboe Cc: Sergey Senozhatsky Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 11 ++++++++++ mm/memory.c | 52 ++++++++++++++++++++++++++++++-------------- mm/page_io.c | 6 ++--- mm/swapfile.c | 11 ++++++---- 4 files changed, 57 insertions(+), 23 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 933d7c0c3542..32c06f028c7b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -466,6 +466,7 @@ extern int page_swapcount(struct page *); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); +extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); extern bool reuse_swap_page(struct page *, int *); extern int try_to_free_swap(struct page *); struct backing_dev_info; @@ -474,6 +475,16 @@ extern void exit_swap_address_space(unsigned int type); #else /* CONFIG_SWAP */ +static inline int swap_readpage(struct page *page, bool do_poll) +{ + return 0; +} + +static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) +{ + return NULL; +} + #define swap_address_space(entry) (NULL) #define get_nr_swap_pages() 0L #define total_swap_pages 0L diff --git a/mm/memory.c b/mm/memory.c index cae514e7dcfc..f75bff2cf662 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2842,7 +2842,7 @@ EXPORT_SYMBOL(unmap_mapping_range); int do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct page *page = NULL, *swapcache; + struct page *page = NULL, *swapcache = NULL; struct mem_cgroup *memcg; struct vma_swap_readahead swap_ra; swp_entry_t entry; @@ -2881,17 +2881,35 @@ int do_swap_page(struct vm_fault *vmf) } goto out; } + + delayacct_set_flag(DELAYACCT_PF_SWAPIN); if (!page) page = lookup_swap_cache(entry, vma_readahead ? vma : NULL, vmf->address); if (!page) { - if (vma_readahead) - page = do_swap_page_readahead(entry, - GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); - else - page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, vmf->address); + struct swap_info_struct *si = swp_swap_info(entry); + + if (!(si->flags & SWP_SYNCHRONOUS_IO)) { + if (vma_readahead) + page = do_swap_page_readahead(entry, + GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); + else + page = swapin_readahead(entry, + GFP_HIGHUSER_MOVABLE, vma, vmf->address); + swapcache = page; + } else { + /* skip swapcache */ + page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); + if (page) { + __SetPageLocked(page); + __SetPageSwapBacked(page); + set_page_private(page, entry.val); + lru_cache_add_anon(page); + swap_readpage(page, true); + } + } + if (!page) { /* * Back out if somebody else faulted in this pte @@ -2920,7 +2938,6 @@ int do_swap_page(struct vm_fault *vmf) goto out_release; } - swapcache = page; locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); @@ -2935,7 +2952,8 @@ int do_swap_page(struct vm_fault *vmf) * test below, are not enough to exclude that. Even if it is still * swapcache, we need to check that the page's swap has not changed. */ - if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) + if (unlikely((!PageSwapCache(page) || + page_private(page) != entry.val)) && swapcache) goto out_page; page = ksm_might_need_to_copy(page, vma, vmf->address); @@ -2988,14 +3006,16 @@ int do_swap_page(struct vm_fault *vmf) pte = pte_mksoft_dirty(pte); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); vmf->orig_pte = pte; - if (page == swapcache) { - do_page_add_anon_rmap(page, vma, vmf->address, exclusive); - mem_cgroup_commit_charge(page, memcg, true, false); - activate_page(page); - } else { /* ksm created a completely new copy */ + + /* ksm created a completely new copy */ + if (unlikely(page != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); + } else { + do_page_add_anon_rmap(page, vma, vmf->address, exclusive); + mem_cgroup_commit_charge(page, memcg, true, false); + activate_page(page); } swap_free(entry); @@ -3003,7 +3023,7 @@ int do_swap_page(struct vm_fault *vmf) (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); - if (page != swapcache) { + if (page != swapcache && swapcache) { /* * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check @@ -3036,7 +3056,7 @@ out_page: unlock_page(page); out_release: put_page(page); - if (page != swapcache) { + if (page != swapcache && swapcache) { unlock_page(swapcache); put_page(swapcache); } diff --git a/mm/page_io.c b/mm/page_io.c index cd52b9cc169b..e93f1a4cacd7 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -347,7 +347,7 @@ out: return ret; } -int swap_readpage(struct page *page, bool do_poll) +int swap_readpage(struct page *page, bool synchronous) { struct bio *bio; int ret = 0; @@ -355,7 +355,7 @@ int swap_readpage(struct page *page, bool do_poll) blk_qc_t qc; struct gendisk *disk; - VM_BUG_ON_PAGE(!PageSwapCache(page), page); + VM_BUG_ON_PAGE(!PageSwapCache(page) && !synchronous, page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageUptodate(page), page); if (frontswap_load(page) == 0) { @@ -403,7 +403,7 @@ int swap_readpage(struct page *page, bool do_poll) count_vm_event(PSWPIN); bio_get(bio); qc = submit_bio(bio); - while (do_poll) { + while (synchronous) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(bio->bi_private)) break; diff --git a/mm/swapfile.c b/mm/swapfile.c index cb08fa65819f..85aff8a42801 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -3455,10 +3455,15 @@ int swapcache_prepare(swp_entry_t entry) return __swap_duplicate(entry, SWAP_HAS_CACHE); } +struct swap_info_struct *swp_swap_info(swp_entry_t entry) +{ + return swap_info[swp_type(entry)]; +} + struct swap_info_struct *page_swap_info(struct page *page) { - swp_entry_t swap = { .val = page_private(page) }; - return swap_info[swp_type(swap)]; + swp_entry_t entry = { .val = page_private(page) }; + return swp_swap_info(entry); } /* @@ -3466,7 +3471,6 @@ struct swap_info_struct *page_swap_info(struct page *page) */ struct address_space *__page_file_mapping(struct page *page) { - VM_BUG_ON_PAGE(!PageSwapCache(page), page); return page_swap_info(page)->swap_file->f_mapping; } EXPORT_SYMBOL_GPL(__page_file_mapping); @@ -3474,7 +3478,6 @@ EXPORT_SYMBOL_GPL(__page_file_mapping); pgoff_t __page_file_index(struct page *page) { swp_entry_t swap = { .val = page_private(page) }; - VM_BUG_ON_PAGE(!PageSwapCache(page), page); return swp_offset(swap); } EXPORT_SYMBOL_GPL(__page_file_index); From aa8d22a11da933dbf880b4933b58931f4aefe91c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 15 Nov 2017 17:33:11 -0800 Subject: [PATCH 032/131] mm: swap: SWP_SYNCHRONOUS_IO: skip swapcache only if swapped page has no other reference When SWP_SYNCHRONOUS_IO swapped-in pages are shared by several processes, it can cause unnecessary memory wastage by skipping swap cache. Because, with swapin fault by read, they could share a page if the page were in swap cache. Thus, it avoids allocating same content new pages. This patch makes the swapcache skipping work only if the swap pte is non-sharable. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/1507620825-5537-1-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Cc: Dan Williams Cc: Ross Zwisler Cc: Hugh Dickins Cc: Christoph Hellwig Cc: Ilya Dryomov Cc: Jens Axboe Cc: Sergey Senozhatsky Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 6 ++++++ mm/memory.c | 19 ++++++++++--------- mm/swapfile.c | 7 +++++++ 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 32c06f028c7b..8b8a6f965785 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -463,6 +463,7 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); +extern int __swap_count(struct swap_info_struct *si, swp_entry_t entry); extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); @@ -589,6 +590,11 @@ static inline int page_swapcount(struct page *page) return 0; } +static inline int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +{ + return 0; +} + static inline int __swp_swapcount(swp_entry_t entry) { return 0; diff --git a/mm/memory.c b/mm/memory.c index f75bff2cf662..a4518aedf4dd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2890,15 +2890,8 @@ int do_swap_page(struct vm_fault *vmf) if (!page) { struct swap_info_struct *si = swp_swap_info(entry); - if (!(si->flags & SWP_SYNCHRONOUS_IO)) { - if (vma_readahead) - page = do_swap_page_readahead(entry, - GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); - else - page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, vmf->address); - swapcache = page; - } else { + if (si->flags & SWP_SYNCHRONOUS_IO && + __swap_count(si, entry) == 1) { /* skip swapcache */ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (page) { @@ -2908,6 +2901,14 @@ int do_swap_page(struct vm_fault *vmf) lru_cache_add_anon(page); swap_readpage(page, true); } + } else { + if (vma_readahead) + page = do_swap_page_readahead(entry, + GFP_HIGHUSER_MOVABLE, vmf, &swap_ra); + else + page = swapin_readahead(entry, + GFP_HIGHUSER_MOVABLE, vma, vmf->address); + swapcache = page; } if (!page) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 85aff8a42801..3074b02eaa09 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1328,6 +1328,13 @@ int page_swapcount(struct page *page) return count; } +int __swap_count(struct swap_info_struct *si, swp_entry_t entry) +{ + pgoff_t offset = swp_offset(entry); + + return swap_count(si->swap_map[offset]); +} + static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { int count = 0; From e9a6effa500526e2a19d5ad042cb758b55b1ef93 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 15 Nov 2017 17:33:15 -0800 Subject: [PATCH 033/131] mm, swap: fix false error message in __swp_swapcount() When a page fault occurs for a swap entry, the physical swap readahead (not the VMA base swap readahead) may readahead several swap entries after the fault swap entry. The readahead algorithm calculates some of the swap entries to readahead via increasing the offset of the fault swap entry without checking whether they are beyond the end of the swap device and it relys on the __swp_swapcount() and swapcache_prepare() to check it. Although __swp_swapcount() checks for the swap entry passed in, it will complain with the error message as follow for the expected invalid swap entry. This may make the end users confused. swap_info_get: Bad swap offset entry 0200f8a7 To fix the false error message, the swap entry checking is added in swapin_readahead() to avoid to pass the out-of-bound swap entries and the swap entry reserved for the swap header to __swp_swapcount() and swapcache_prepare(). Link: http://lkml.kernel.org/r/20171102054225.22897-1-ying.huang@intel.com Fixes: e8c26ab60598 ("mm/swap: skip readahead for unreferenced swap slots") Signed-off-by: "Huang, Ying" Reported-by: Christian Kujau Acked-by: Minchan Kim Suggested-by: Minchan Kim Cc: Tim Chen Cc: Michal Hocko Cc: Hugh Dickins Cc: [4.11+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/swap_state.c b/mm/swap_state.c index 326439428daf..f2face8b889e 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -559,6 +559,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, unsigned long offset = entry_offset; unsigned long start_offset, end_offset; unsigned long mask; + struct swap_info_struct *si = swp_swap_info(entry); struct blk_plug plug; bool do_poll = true, page_allocated; @@ -572,6 +573,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, end_offset = offset | mask; if (!start_offset) /* First page is swap header. */ start_offset++; + if (end_offset >= si->max) + end_offset = si->max - 1; blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { From 4c578dce58038a5b3cb7ffc77a5f62ef2c5d0856 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Wed, 15 Nov 2017 17:33:19 -0800 Subject: [PATCH 034/131] mm/page-writeback.c: remove unused parameter from balance_dirty_pages() "mapping" parameter to balance_dirty_pages() is not used anymore. Fixes: dfb8ae567835 ("writeback: let balance_dirty_pages() work on the matching cgroup bdi_writeback") Link: http://lkml.kernel.org/r/20170927221311.23263-1-tahsin@google.com Signed-off-by: Tahsin Erdogan Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c518c845f202..76a43c17761b 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1559,8 +1559,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) * If we're over `background_thresh' then the writeback threads are woken to * perform some writeout. */ -static void balance_dirty_pages(struct address_space *mapping, - struct bdi_writeback *wb, +static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long pages_dirtied) { struct dirty_throttle_control gdtc_stor = { GDTC_INIT(wb) }; @@ -1910,7 +1909,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) preempt_enable(); if (unlikely(current->nr_dirtied >= ratelimit)) - balance_dirty_pages(mapping, wb, current->nr_dirtied); + balance_dirty_pages(wb, current->nr_dirtied); wb_put(wb); } From d7b236e10cedd95373a79fd53b7e9c105bea4f08 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Nov 2017 17:33:22 -0800 Subject: [PATCH 035/131] mm: drop migrate type checks from has_unmovable_pages Michael has noticed that the memory offline tries to migrate kernel code pages when doing echo 0 > /sys/devices/system/memory/memory0/online The current implementation will fail the operation after several failed page migration attempts but we shouldn't even attempt to migrate that memory and fail right away because this memory is clearly not migrateable. This will become a real problem when we drop the retry loop counter resp. timeout. The real problem is in has_unmovable_pages in fact. We should fail if there are any non migrateable pages in the area. In orther to guarantee that remove the migrate type checks because MIGRATE_MOVABLE is not guaranteed to contain only migrateable pages. It is merely a heuristic. Similarly MIGRATE_CMA does guarantee that the page allocator doesn't allocate any non-migrateable pages from the block but CMA allocations themselves are unlikely to migrateable. Therefore remove both checks. [akpm@linux-foundation.org: remove unused local `mt'] Link: http://lkml.kernel.org/r/20171013120013.698-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Michael Ellerman Tested-by: Michael Ellerman Acked-by: Vlastimil Babka Tested-by: Tony Lindgren Tested-by: Ran Wang Cc: Igor Mammedov Cc: KAMEZAWA Hiroyuki Cc: Reza Arbab Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 77e4d3c5c57b..e6d4234e0d1a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7356,7 +7356,6 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, bool skip_hwpoisoned_pages) { unsigned long pfn, iter, found; - int mt; /* * For avoiding noise data, lru_add_drain_all() should be called @@ -7364,9 +7363,6 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, */ if (zone_idx(zone) == ZONE_MOVABLE) return false; - mt = get_pageblock_migratetype(page); - if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) - return false; pfn = page_to_pfn(page); for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { From 4da2ce250f986060750fcc5b29112914e31803ba Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Nov 2017 17:33:26 -0800 Subject: [PATCH 036/131] mm: distinguish CMA and MOVABLE isolation in has_unmovable_pages() Joonsoo has noticed that "mm: drop migrate type checks from has_unmovable_pages" would break CMA allocator because it relies on has_unmovable_pages returning false even for CMA pageblocks which in fact don't have to be movable: alloc_contig_range start_isolate_page_range set_migratetype_isolate has_unmovable_pages This is a result of the code sharing between CMA and memory hotplug while each one has a different idea of what has_unmovable_pages should return. This is unfortunate but fixing it properly would require a lot of code duplication. Fix the issue by introducing the requested migrate type argument and special case MIGRATE_CMA case where CMA page blocks are handled properly. This will work for memory hotplug because it requires MIGRATE_MOVABLE. Link: http://lkml.kernel.org/r/20171019122118.y6cndierwl2vnguj@dhcp22.suse.cz Signed-off-by: Michal Hocko Reported-by: Joonsoo Kim Tested-by: Stefan Wahren Tested-by: Ran Wang Cc: Michael Ellerman Cc: Vlastimil Babka Cc: Igor Mammedov Cc: KAMEZAWA Hiroyuki Cc: Reza Arbab Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-isolation.h | 2 +- mm/page_alloc.c | 12 +++++++++++- mm/page_isolation.c | 10 +++++----- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 05a04e603686..cdad58bbfd8b 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -31,7 +31,7 @@ static inline bool is_migrate_isolate(int migratetype) #endif bool has_unmovable_pages(struct zone *zone, struct page *page, int count, - bool skip_hwpoisoned_pages); + int migratetype, bool skip_hwpoisoned_pages); void set_pageblock_migratetype(struct page *page, int migratetype); int move_freepages_block(struct zone *zone, struct page *page, int migratetype, int *num_movable); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e6d4234e0d1a..755f35f4bc8b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7353,6 +7353,7 @@ void *__init alloc_large_system_hash(const char *tablename, * race condition. So you can't expect this function should be exact. */ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, + int migratetype, bool skip_hwpoisoned_pages) { unsigned long pfn, iter, found; @@ -7364,6 +7365,15 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, if (zone_idx(zone) == ZONE_MOVABLE) return false; + /* + * CMA allocations (alloc_contig_range) really need to mark isolate + * CMA pageblocks even when they are not movable in fact so consider + * them movable here. + */ + if (is_migrate_cma(migratetype) && + is_migrate_cma(get_pageblock_migratetype(page))) + return false; + pfn = page_to_pfn(page); for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { unsigned long check = pfn + iter; @@ -7446,7 +7456,7 @@ bool is_pageblock_removable_nolock(struct page *page) if (!zone_spans_pfn(zone, pfn)) return false; - return !has_unmovable_pages(zone, page, 0, true); + return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); } #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 44f213935bf6..165ed8117bd1 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -15,7 +15,7 @@ #define CREATE_TRACE_POINTS #include -static int set_migratetype_isolate(struct page *page, +static int set_migratetype_isolate(struct page *page, int migratetype, bool skip_hwpoisoned_pages) { struct zone *zone; @@ -52,7 +52,7 @@ static int set_migratetype_isolate(struct page *page, * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. * We just check MOVABLE pages. */ - if (!has_unmovable_pages(zone, page, arg.pages_found, + if (!has_unmovable_pages(zone, page, arg.pages_found, migratetype, skip_hwpoisoned_pages)) ret = 0; @@ -64,14 +64,14 @@ static int set_migratetype_isolate(struct page *page, out: if (!ret) { unsigned long nr_pages; - int migratetype = get_pageblock_migratetype(page); + int mt = get_pageblock_migratetype(page); set_pageblock_migratetype(page, MIGRATE_ISOLATE); zone->nr_isolate_pageblock++; nr_pages = move_freepages_block(zone, page, MIGRATE_ISOLATE, NULL); - __mod_zone_freepage_state(zone, -nr_pages, migratetype); + __mod_zone_freepage_state(zone, -nr_pages, mt); } spin_unlock_irqrestore(&zone->lock, flags); @@ -183,7 +183,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); if (page && - set_migratetype_isolate(page, skip_hwpoisoned_pages)) { + set_migratetype_isolate(page, migratetype, skip_hwpoisoned_pages)) { undo_pfn = pfn; goto undo; } From d7ab3672c3ff7b2a2be3f15fcee77414fd9c4d7a Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Nov 2017 17:33:30 -0800 Subject: [PATCH 037/131] mm, page_alloc: fail has_unmovable_pages when seeing reserved pages Reserved pages should be completely ignored by the core mm because they have a special meaning for their owners. has_unmovable_pages doesn't check those so we rely on other tests (reference count, or PageLRU) to fail on such pages. Althought this happens to work it is safer to simply check for those explicitly and do not rely on the owner of the page to abuse those fields for special purposes. Please note that this is more of a further fortification of the code rahter than a fix of an existing issue. Link: http://lkml.kernel.org/r/20171013120756.jeopthigbmm3c7bl@dhcp22.suse.cz Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Igor Mammedov Cc: KAMEZAWA Hiroyuki Cc: Michael Ellerman Cc: Reza Arbab Cc: Vitaly Kuznetsov Cc: Xishi Qiu Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 755f35f4bc8b..e6106d7e9eb0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7383,6 +7383,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count, page = pfn_to_page(check); + if (PageReserved(page)) + return true; + /* * Hugepages are not in LRU lists, but they're movable. * We need not scan over tail pages bacause we don't From 72b39cfc4d750e5b8c633a7a6fdd7d07927995ad Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Nov 2017 17:33:34 -0800 Subject: [PATCH 038/131] mm, memory_hotplug: do not fail offlining too early Patch series "mm, memory_hotplug: redefine memory offline retry logic", v2. While testing memory hotplug on a large 4TB machine we have noticed that memory offlining is just too eager to fail. The primary reason is that the retry logic is just too easy to give up. We have 4 ways out of the offline - we have a permanent failure (isolation or memory notifiers fail, or hugetlb pages cannot be dropped) - userspace sends a signal - a hardcoded 120s timeout expires - page migration fails 5 times This is way too convoluted and it doesn't scale very well. We have seen both temporary migration failures as well as 120s being triggered. After removing those restrictions we were able to pass stress testing during memory hot remove without any other negative side effects observed. Therefore I suggest dropping both hard coded policies. I couldn't have found any specific reason for them in the changelog. I neither didn't get any response [1] from Kamezawa. If we need some upper bound - e.g. timeout based - then we should have a proper and user defined policy for that. In any case there should be a clear use case when introducing it. This patch (of 2): Memory offlining can fail too eagerly under heavy memory pressure. page:ffffea22a646bd00 count:255 mapcount:252 mapping:ffff88ff926c9f38 index:0x3 flags: 0x9855fe40010048(uptodate|active|mappedtodisk) page dumped because: isolation failed page->mem_cgroup:ffff8801cd662000 memory offlining [mem 0x18b580000000-0x18b5ffffffff] failed Isolation has failed here because the page is not on LRU. Most probably because it was on the pcp LRU cache or it has been removed from the LRU already but it hasn't been freed yet. In both cases the page doesn't look non-migrable so retrying more makes sense. __offline_pages seems rather cluttered when it comes to the retry logic. We have 5 retries at maximum and a timeout. We could argue whether the timeout makes sense but failing just because of a race when somebody isoltes a page from LRU or puts it on a pcp LRU lists is just wrong. It only takes it to race with a process which unmaps some pages and remove them from the LRU list and we can fail the whole offline because of something that is a temporary condition and actually not harmful for the offline. Please note that unmovable pages should be already excluded during start_isolate_page_range. We could argue that has_unmovable_pages is racy and MIGRATE_MOVABLE check doesn't provide any hard guarantee either but kernel zones (aka < ZONE_MOVABLE) will very likely detect unmovable pages in most cases and movable zone shouldn't contain unmovable pages at all. Some of those pages might be pinned but not for ever because that would be a bug on its own. In any case the context is still interruptible and so the userspace can easily bail out when the operation takes too long. This is certainly better behavior than a hardcoded retry loop which is racy. Fix this by removing the max retry count and only rely on the timeout resp. interruption by a signal from the userspace. Also retry rather than fail when check_pages_isolated sees some !free pages because those could be a result of the race as well. Link: http://lkml.kernel.org/r/20170918070834.13083-2-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: KAMEZAWA Hiroyuki Cc: Reza Arbab Cc: Yasuaki Ishimatsu Cc: Xishi Qiu Cc: Igor Mammedov Cc: Vitaly Kuznetsov Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index d4b5f29906b9..014e9090cb77 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1594,7 +1594,7 @@ static int __ref __offline_pages(unsigned long start_pfn, { unsigned long pfn, nr_pages, expire; long offlined_pages; - int ret, drain, retry_max, node; + int ret, node; unsigned long flags; unsigned long valid_start, valid_end; struct zone *zone; @@ -1631,43 +1631,25 @@ static int __ref __offline_pages(unsigned long start_pfn, pfn = start_pfn; expire = jiffies + timeout; - drain = 0; - retry_max = 5; repeat: /* start memory hot removal */ - ret = -EAGAIN; + ret = -EBUSY; if (time_after(jiffies, expire)) goto failed_removal; ret = -EINTR; if (signal_pending(current)) goto failed_removal; - ret = 0; - if (drain) { - lru_add_drain_all_cpuslocked(); - cond_resched(); - drain_all_pages(zone); - } + + cond_resched(); + lru_add_drain_all_cpuslocked(); + drain_all_pages(zone); pfn = scan_movable_pages(start_pfn, end_pfn); if (pfn) { /* We have movable pages */ ret = do_migrate_range(pfn, end_pfn); - if (!ret) { - drain = 1; - goto repeat; - } else { - if (ret < 0) - if (--retry_max == 0) - goto failed_removal; - yield(); - drain = 1; - goto repeat; - } + goto repeat; } - /* drain all zone's lru pagevec, this is asynchronous... */ - lru_add_drain_all_cpuslocked(); - yield(); - /* drain pcp pages, this is synchronous. */ - drain_all_pages(zone); + /* * dissolve free hugepages in the memory block before doing offlining * actually in order to make hugetlbfs's object counting consistent. @@ -1677,10 +1659,8 @@ repeat: goto failed_removal; /* check again */ offlined_pages = check_pages_isolated(start_pfn, end_pfn); - if (offlined_pages < 0) { - ret = -EBUSY; - goto failed_removal; - } + if (offlined_pages < 0) + goto repeat; pr_info("Offlined Pages %ld\n", offlined_pages); /* Ok, all of our target is isolated. We cannot do rollback at this point. */ From ecde0f3e7f9edf8629f56b2354385dc8d0a6a24d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Nov 2017 17:33:38 -0800 Subject: [PATCH 039/131] mm, memory_hotplug: remove timeout from __offline_memory We have a hardcoded 120s timeout after which the memory offline fails basically since the hot remove has been introduced. This is essentially a policy implemented in the kernel. Moreover there is no way to adjust the timeout and so we are sometimes facing memory offline failures if the system is under a heavy memory pressure or very intensive CPU workload on large machines. It is not very clear what purpose the timeout actually serves. The offline operation is interruptible by a signal so if userspace wants some timeout based termination this can be done trivially by sending a signal. If there is a strong usecase to do this from the kernel then we should do it properly and have a it tunable from the userspace with the timeout disabled by default along with the explanation who uses it and for what purporse. Link: http://lkml.kernel.org/r/20170918070834.13083-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: KAMEZAWA Hiroyuki Cc: Reza Arbab Cc: Yasuaki Ishimatsu Cc: Xishi Qiu Cc: Igor Mammedov Cc: Vitaly Kuznetsov Cc: Michael Ellerman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 014e9090cb77..fab51a6af962 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1590,9 +1590,9 @@ static void node_states_clear_node(int node, struct memory_notify *arg) } static int __ref __offline_pages(unsigned long start_pfn, - unsigned long end_pfn, unsigned long timeout) + unsigned long end_pfn) { - unsigned long pfn, nr_pages, expire; + unsigned long pfn, nr_pages; long offlined_pages; int ret, node; unsigned long flags; @@ -1630,12 +1630,8 @@ static int __ref __offline_pages(unsigned long start_pfn, goto failed_removal; pfn = start_pfn; - expire = jiffies + timeout; repeat: /* start memory hot removal */ - ret = -EBUSY; - if (time_after(jiffies, expire)) - goto failed_removal; ret = -EINTR; if (signal_pending(current)) goto failed_removal; @@ -1708,7 +1704,7 @@ failed_removal: /* Must be protected by mem_hotplug_begin() or a device_lock */ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) { - return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); + return __offline_pages(start_pfn, start_pfn + nr_pages); } #endif /* CONFIG_MEMORY_HOTREMOVE */ From 66e8b438bd5c75498cfe915c4219049eaebcb869 Mon Sep 17 00:00:00 2001 From: Gioh Kim Date: Wed, 15 Nov 2017 17:33:42 -0800 Subject: [PATCH 040/131] mm/memblock.c: make the index explicit argument of for_each_memblock_type for_each_memblock_type macro function relies on idx variable defined in the caller context. Silent macro arguments are almost always wrong thing to do. They make code harder to read and easier to get wrong. Let's use an explicit iterator parameter for for_each_memblock_type and make the code more obious. This patch is a mere cleanup and it shouldn't introduce any functional change. Link: http://lkml.kernel.org/r/20170913133029.28911-1-gi-oh.kim@profitbricks.com Signed-off-by: Gioh Kim Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 8 ++++---- mm/memblock.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index bae11c7e7bf3..ce0e5634c2f9 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -389,10 +389,10 @@ static inline unsigned long memblock_region_reserved_end_pfn(const struct memblo region < (memblock.memblock_type.regions + memblock.memblock_type.cnt); \ region++) -#define for_each_memblock_type(memblock_type, rgn) \ - for (idx = 0, rgn = &memblock_type->regions[0]; \ - idx < memblock_type->cnt; \ - idx++, rgn = &memblock_type->regions[idx]) +#define for_each_memblock_type(i, memblock_type, rgn) \ + for (i = 0, rgn = &memblock_type->regions[0]; \ + i < memblock_type->cnt; \ + i++, rgn = &memblock_type->regions[i]) #ifdef CONFIG_MEMTEST extern void early_memtest(phys_addr_t start, phys_addr_t end); diff --git a/mm/memblock.c b/mm/memblock.c index 91205780e6b1..18dbb69086bc 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -533,7 +533,7 @@ repeat: base = obase; nr_new = 0; - for_each_memblock_type(type, rgn) { + for_each_memblock_type(idx, type, rgn) { phys_addr_t rbase = rgn->base; phys_addr_t rend = rbase + rgn->size; @@ -637,7 +637,7 @@ static int __init_memblock memblock_isolate_range(struct memblock_type *type, if (memblock_double_array(type, base, size) < 0) return -ENOMEM; - for_each_memblock_type(type, rgn) { + for_each_memblock_type(idx, type, rgn) { phys_addr_t rbase = rgn->base; phys_addr_t rend = rbase + rgn->size; @@ -1715,7 +1715,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type) pr_info(" %s.cnt = 0x%lx\n", type->name, type->cnt); - for_each_memblock_type(type, rgn) { + for_each_memblock_type(idx, type, rgn) { char nid_buf[32] = ""; base = rgn->base; @@ -1739,7 +1739,7 @@ memblock_reserved_memory_within(phys_addr_t start_addr, phys_addr_t end_addr) unsigned long size = 0; int idx; - for_each_memblock_type((&memblock.reserved), rgn) { + for_each_memblock_type(idx, (&memblock.reserved), rgn) { phys_addr_t start, end; if (rgn->base + rgn->size < start_addr) From 0f6d24f878568fac579a1962d0bf7cb9f01e0ceb Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Wed, 15 Nov 2017 17:33:45 -0800 Subject: [PATCH 041/131] mm/page-writeback.c: print a warning if the vm dirtiness settings are illogical The vm direct limit setting must be set greater than vm background limit setting. Otherwise print a warning to help the operator to figure out that the vm dirtiness settings is in illogical state. Link: http://lkml.kernel.org/r/1506592464-30962-1-git-send-email-laoar.shao@gmail.com Signed-off-by: Yafang Shao Cc: Jan Kara Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 7 +++++++ mm/page-writeback.c | 5 ++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 9baf66a9ef4e..30fd16b14196 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -157,6 +157,10 @@ Note: the minimum value allowed for dirty_bytes is two pages (in bytes); any value lower than this limit will be ignored and the old configuration will be retained. +Note: the value of dirty_bytes also must be set greater than +dirty_background_bytes or the amount of memory corresponding to +dirty_background_ratio. + ============================================================== dirty_expire_centisecs @@ -176,6 +180,9 @@ generating disk writes will itself start writing out dirty data. The total available memory is not equal to total system memory. +Note: dirty_ratio must be set greater than dirty_background_ratio or +ratio corresponding to dirty_background_bytes. + ============================================================== dirty_writeback_centisecs diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 76a43c17761b..768fe4e37e6a 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -433,8 +433,11 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) else bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; - if (bg_thresh >= thresh) + if (unlikely(bg_thresh >= thresh)) { + pr_warn("vm direct limit must be set greater than background limit.\n"); bg_thresh = thresh / 2; + } + tsk = current; if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; From 5ef3a8b12556d7fcba81edc74e9d85b029615ae0 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 15 Nov 2017 17:33:49 -0800 Subject: [PATCH 042/131] zram: add zstd to the supported algorithms list Add ZSTD to the list of supported compression algorithms. ZRAM fio perf test: LZO DEFLATE ZSTD #jobs1 WRITE: (2180MB/s) (77.2MB/s) (1429MB/s) WRITE: (1617MB/s) (77.7MB/s) (1202MB/s) READ: (426MB/s) (595MB/s) (1181MB/s) READ: (422MB/s) (572MB/s) (1020MB/s) READ: (318MB/s) (67.8MB/s) (563MB/s) WRITE: (318MB/s) (67.9MB/s) (564MB/s) READ: (336MB/s) (68.3MB/s) (583MB/s) WRITE: (335MB/s) (68.2MB/s) (582MB/s) #jobs2 WRITE: (3441MB/s) (152MB/s) (2141MB/s) WRITE: (2507MB/s) (147MB/s) (1888MB/s) READ: (801MB/s) (1146MB/s) (1890MB/s) READ: (767MB/s) (1096MB/s) (2073MB/s) READ: (621MB/s) (126MB/s) (1009MB/s) WRITE: (621MB/s) (126MB/s) (1009MB/s) READ: (656MB/s) (125MB/s) (1075MB/s) WRITE: (657MB/s) (126MB/s) (1077MB/s) #jobs3 WRITE: (4772MB/s) (225MB/s) (3394MB/s) WRITE: (3905MB/s) (211MB/s) (2939MB/s) READ: (1216MB/s) (1608MB/s) (3218MB/s) READ: (1159MB/s) (1431MB/s) (2981MB/s) READ: (906MB/s) (156MB/s) (1457MB/s) WRITE: (907MB/s) (156MB/s) (1458MB/s) READ: (953MB/s) (158MB/s) (1595MB/s) WRITE: (952MB/s) (157MB/s) (1593MB/s) #jobs4 WRITE: (6036MB/s) (265MB/s) (4469MB/s) WRITE: (5059MB/s) (263MB/s) (3951MB/s) READ: (1618MB/s) (2066MB/s) (4276MB/s) READ: (1573MB/s) (1942MB/s) (3830MB/s) READ: (1202MB/s) (227MB/s) (1971MB/s) WRITE: (1200MB/s) (227MB/s) (1968MB/s) READ: (1265MB/s) (226MB/s) (2116MB/s) WRITE: (1264MB/s) (226MB/s) (2114MB/s) #jobs5 WRITE: (5339MB/s) (233MB/s) (3781MB/s) WRITE: (4298MB/s) (234MB/s) (3276MB/s) READ: (1626MB/s) (2048MB/s) (4081MB/s) READ: (1567MB/s) (1929MB/s) (3758MB/s) READ: (1174MB/s) (205MB/s) (1747MB/s) WRITE: (1173MB/s) (204MB/s) (1746MB/s) READ: (1214MB/s) (208MB/s) (1890MB/s) WRITE: (1215MB/s) (208MB/s) (1892MB/s) #jobs6 WRITE: (5666MB/s) (270MB/s) (4338MB/s) WRITE: (4828MB/s) (267MB/s) (3772MB/s) READ: (1803MB/s) (2058MB/s) (4946MB/s) READ: (1805MB/s) (2156MB/s) (4711MB/s) READ: (1334MB/s) (235MB/s) (2135MB/s) WRITE: (1335MB/s) (235MB/s) (2137MB/s) READ: (1364MB/s) (236MB/s) (2268MB/s) WRITE: (1365MB/s) (237MB/s) (2270MB/s) #jobs7 WRITE: (5474MB/s) (270MB/s) (4300MB/s) WRITE: (4666MB/s) (266MB/s) (3817MB/s) READ: (2022MB/s) (2319MB/s) (5472MB/s) READ: (1924MB/s) (2260MB/s) (5031MB/s) READ: (1369MB/s) (242MB/s) (2153MB/s) WRITE: (1370MB/s) (242MB/s) (2155MB/s) READ: (1499MB/s) (246MB/s) (2310MB/s) WRITE: (1497MB/s) (246MB/s) (2307MB/s) #jobs8 WRITE: (5558MB/s) (273MB/s) (4439MB/s) WRITE: (4763MB/s) (271MB/s) (3918MB/s) READ: (2201MB/s) (2599MB/s) (6062MB/s) READ: (2105MB/s) (2463MB/s) (5413MB/s) READ: (1490MB/s) (252MB/s) (2238MB/s) WRITE: (1488MB/s) (252MB/s) (2236MB/s) READ: (1566MB/s) (254MB/s) (2434MB/s) WRITE: (1568MB/s) (254MB/s) (2437MB/s) #jobs9 WRITE: (5120MB/s) (264MB/s) (4035MB/s) WRITE: (4531MB/s) (267MB/s) (3740MB/s) READ: (1940MB/s) (2258MB/s) (4986MB/s) READ: (2024MB/s) (2387MB/s) (4871MB/s) READ: (1343MB/s) (246MB/s) (2038MB/s) WRITE: (1342MB/s) (246MB/s) (2037MB/s) READ: (1553MB/s) (238MB/s) (2243MB/s) WRITE: (1552MB/s) (238MB/s) (2242MB/s) #jobs10 WRITE: (5345MB/s) (271MB/s) (3988MB/s) WRITE: (4750MB/s) (254MB/s) (3668MB/s) READ: (1876MB/s) (2363MB/s) (5150MB/s) READ: (1990MB/s) (2256MB/s) (5080MB/s) READ: (1355MB/s) (250MB/s) (2019MB/s) WRITE: (1356MB/s) (251MB/s) (2020MB/s) READ: (1490MB/s) (252MB/s) (2202MB/s) WRITE: (1488MB/s) (252MB/s) (2199MB/s) jobs1 perfstat instructions 52,065,555,710 ( 0.79) 855,731,114,587 ( 2.64) 54,280,709,944 ( 1.40) branches 14,020,427,116 ( 725.847) 101,733,449,582 (1074.521) 11,170,591,067 ( 992.869) branch-misses 22,626,174 ( 0.16%) 274,197,885 ( 0.27%) 25,915,805 ( 0.23%) jobs2 perfstat instructions 103,633,110,402 ( 0.75) 1,710,822,100,914 ( 2.59) 107,879,874,104 ( 1.28) branches 27,931,237,282 ( 679.203) 203,298,267,479 (1037.326) 22,185,350,842 ( 884.427) branch-misses 46,103,811 ( 0.17%) 533,747,204 ( 0.26%) 49,682,483 ( 0.22%) jobs3 perfstat instructions 154,857,283,657 ( 0.76) 2,565,748,974,197 ( 2.57) 161,515,435,813 ( 1.31) branches 41,759,490,355 ( 670.529) 304,905,605,277 ( 978.765) 33,215,805,907 ( 888.003) branch-misses 74,263,293 ( 0.18%) 759,746,240 ( 0.25%) 76,841,196 ( 0.23%) jobs4 perfstat instructions 206,215,849,076 ( 0.75) 3,420,169,460,897 ( 2.60) 215,003,061,664 ( 1.31) branches 55,632,141,739 ( 666.501) 406,394,977,433 ( 927.241) 44,214,322,251 ( 883.532) branch-misses 102,287,788 ( 0.18%) 1,098,617,314 ( 0.27%) 103,891,040 ( 0.23%) jobs5 perfstat instructions 258,711,315,588 ( 0.67) 4,275,657,533,244 ( 2.23) 269,332,235,685 ( 1.08) branches 69,802,821,166 ( 588.823) 507,996,211,252 ( 797.036) 55,450,846,129 ( 735.095) branch-misses 129,217,214 ( 0.19%) 1,243,284,991 ( 0.24%) 173,512,278 ( 0.31%) jobs6 perfstat instructions 312,796,166,008 ( 0.61) 5,133,896,344,660 ( 2.02) 323,658,769,588 ( 1.04) branches 84,372,488,583 ( 520.541) 610,310,494,402 ( 697.642) 66,683,292,992 ( 693.939) branch-misses 159,438,978 ( 0.19%) 1,396,368,563 ( 0.23%) 174,406,934 ( 0.26%) jobs7 perfstat instructions 363,211,372,930 ( 0.56) 5,988,205,600,879 ( 1.75) 377,824,674,156 ( 0.93) branches 98,057,013,765 ( 463.117) 711,841,255,974 ( 598.762) 77,879,009,954 ( 600.443) branch-misses 199,513,153 ( 0.20%) 1,507,651,077 ( 0.21%) 248,203,369 ( 0.32%) jobs8 perfstat instructions 413,960,354,615 ( 0.52) 6,842,918,558,378 ( 1.45) 431,938,486,581 ( 0.83) branches 111,812,574,884 ( 414.224) 813,299,084,518 ( 491.173) 89,062,699,827 ( 517.795) branch-misses 233,584,845 ( 0.21%) 1,531,593,921 ( 0.19%) 286,818,489 ( 0.32%) jobs9 perfstat instructions 465,976,220,300 ( 0.53) 7,698,467,237,372 ( 1.47) 486,352,600,321 ( 0.84) branches 125,931,456,162 ( 424.063) 915,207,005,715 ( 498.192) 100,370,404,090 ( 517.439) branch-misses 256,992,445 ( 0.20%) 1,782,809,816 ( 0.19%) 345,239,380 ( 0.34%) jobs10 perfstat instructions 517,406,372,715 ( 0.53) 8,553,527,312,900 ( 1.48) 540,732,653,094 ( 0.84) branches 139,839,780,676 ( 427.732) 1,016,737,699,389 ( 503.172) 111,696,557,638 ( 516.750) branch-misses 259,595,561 ( 0.19%) 1,952,570,279 ( 0.19%) 357,818,661 ( 0.32%) seconds elapsed 20.630411534 96.084546565 12.743373571 seconds elapsed 22.292627625 100.984155001 14.407413560 seconds elapsed 22.396016966 110.344880848 14.032201392 seconds elapsed 22.517330949 113.351459170 14.243074935 seconds elapsed 28.548305104 156.515193765 19.159286861 seconds elapsed 30.453538116 164.559937678 19.362492717 seconds elapsed 33.467108086 188.486827481 21.492612173 seconds elapsed 35.617727591 209.602677783 23.256422492 seconds elapsed 42.584239509 243.959902566 28.458540338 seconds elapsed 47.683632526 269.635248851 31.542404137 Over all, ZSTD has slower WRITE, but much faster READ (perhaps a static compression buffer used during the test helped ZSTD a lot), which results in faster test results. Memory consumption (zram mm_stat file): zram LZO mm_stat mm_stat (jobs1): 2147483648 23068672 33558528 0 33558528 0 0 mm_stat (jobs2): 2147483648 23068672 33558528 0 33558528 0 0 mm_stat (jobs3): 2147483648 23068672 33558528 0 33562624 0 0 mm_stat (jobs4): 2147483648 23068672 33558528 0 33558528 0 0 mm_stat (jobs5): 2147483648 23068672 33558528 0 33558528 0 0 mm_stat (jobs6): 2147483648 23068672 33558528 0 33562624 0 0 mm_stat (jobs7): 2147483648 23068672 33558528 0 33566720 0 0 mm_stat (jobs8): 2147483648 23068672 33558528 0 33558528 0 0 mm_stat (jobs9): 2147483648 23068672 33558528 0 33558528 0 0 mm_stat (jobs10): 2147483648 23068672 33558528 0 33562624 0 0 zram DEFLATE mm_stat mm_stat (jobs1): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs2): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs3): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs4): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs5): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs6): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs7): 2147483648 16252928 25178112 0 25190400 0 0 mm_stat (jobs8): 2147483648 16252928 25178112 0 25190400 0 0 mm_stat (jobs9): 2147483648 16252928 25178112 0 25178112 0 0 mm_stat (jobs10): 2147483648 16252928 25178112 0 25178112 0 0 zram ZSTD mm_stat mm_stat (jobs1): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs2): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs3): 2147483648 11010048 16781312 0 16785408 0 0 mm_stat (jobs4): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs5): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs6): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs7): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs8): 2147483648 11010048 16781312 0 16781312 0 0 mm_stat (jobs9): 2147483648 11010048 16781312 0 16785408 0 0 mm_stat (jobs10): 2147483648 11010048 16781312 0 16781312 0 0 ================================================================================== Official benchmarks [1]: Compressor name Ratio Compression Decompress. zstd 1.1.3 -1 2.877 430 MB/s 1110 MB/s zlib 1.2.8 -1 2.743 110 MB/s 400 MB/s brotli 0.5.2 -0 2.708 400 MB/s 430 MB/s quicklz 1.5.0 -1 2.238 550 MB/s 710 MB/s lzo1x 2.09 -1 2.108 650 MB/s 830 MB/s lz4 1.7.5 2.101 720 MB/s 3600 MB/s snappy 1.1.3 2.091 500 MB/s 1650 MB/s lzf 3.6 -1 2.077 400 MB/s 860 MB/s Minchan said: : I did test with my sample data and compared zstd with deflate. zstd's : compress ratio is lower a little bit but compression speed is much faster : 3 times more and decompress speed is too 2 times more. With different : data, it is different but overall, zstd would be better for speed at the : cost of a little lower compress ratio(about 5%) so I believe it's worth to : replace deflate. [1] https://github.com/facebook/zstd Link: http://lkml.kernel.org/r/20170912050005.3247-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Tested-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index 5b8992beffec..cc66daec7bbc 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -31,6 +31,9 @@ static const char * const backends[] = { #endif #if IS_ENABLED(CONFIG_CRYPTO_842) "842", +#endif +#if IS_ENABLED(CONFIG_CRYPTO_ZSTD) + "zstd", #endif NULL }; From 0b07ff3972061a5e06e168378dd578801ae3e88e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 15 Nov 2017 17:33:53 -0800 Subject: [PATCH 043/131] zram: remove zlib from the list of recommended algorithms ZSTD tends to outperform deflate/inflate, thus we remove zlib from the list of recommended algorithms and recommend zstd instead. Link: http://lkml.kernel.org/r/20170912050005.3247-2-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Suggested-by: Minchan Kim Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zcomp.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index cc66daec7bbc..4ed0a78fdc09 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -23,9 +23,6 @@ static const char * const backends[] = { #if IS_ENABLED(CONFIG_CRYPTO_LZ4) "lz4", #endif -#if IS_ENABLED(CONFIG_CRYPTO_DEFLATE) - "deflate", -#endif #if IS_ENABLED(CONFIG_CRYPTO_LZ4HC) "lz4hc", #endif From 007ab7b49acb93af650f3acec0e51d2c6b843934 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Wed, 15 Nov 2017 17:33:56 -0800 Subject: [PATCH 044/131] fs/hugetlbfs/inode.c: remove redundant -ENIVAL return from hugetlbfs_setattr() There is no need to have a local return code set with -EINVAL when both the conditions following it return error codes appropriately. Just remove the redundant one. Link: http://lkml.kernel.org/r/20170929145444.17611-1-khandual@linux.vnet.ibm.com Signed-off-by: Anshuman Khandual Reviewed-by: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index ed113ea17aff..6adb17bb1a38 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -668,7 +668,6 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) return error; if (ia_valid & ATTR_SIZE) { - error = -EINVAL; if (attr->ia_size & ~huge_page_mask(h)) return -EINVAL; error = hugetlb_vmtruncate(inode, attr->ia_size); From 0bea803e9e6bf3836d5368a6426c30a8c0e5eab5 Mon Sep 17 00:00:00 2001 From: Ralph Campbell Date: Wed, 15 Nov 2017 17:34:00 -0800 Subject: [PATCH 045/131] mm/hmm: constify hmm_devmem_page_get_drvdata() parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Constify pointer parameter to avoid issue when use from code that only has const struct page pointer to use in the first place. Link: http://lkml.kernel.org/r/1506972774-10191-1-git-send-email-jglisse@redhat.com Signed-off-by: Ralph Campbell Signed-off-by: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 96e69979f84d..325017ad9311 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -471,9 +471,9 @@ static inline void hmm_devmem_page_set_drvdata(struct page *page, * @page: pointer to struct page * Return: driver data value */ -static inline unsigned long hmm_devmem_page_get_drvdata(struct page *page) +static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page) { - unsigned long *drvdata = (unsigned long *)&page->pgmap; + const unsigned long *drvdata = (const unsigned long *)&page->pgmap; return drvdata[1]; } From 1aedcafbf32b3f232c159b14cd0d423fcfe2b861 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 15 Nov 2017 17:34:03 -0800 Subject: [PATCH 046/131] zsmalloc: calling zs_map_object() from irq is a bug Use BUG_ON(in_interrupt()) in zs_map_object(). This is not a new BUG_ON(), it's always been there, but was recently changed to VM_BUG_ON(). There are several problems there. First, we use use per-CPU mappings both in zsmalloc and in zram, and interrupt may easily corrupt those buffers. Second, and more importantly, we believe it's possible to start leaking sensitive information. Consider the following case: -> process P swap out zram per-cpu mapping CPU1 compress page A -> IRQ swap out zram per-cpu mapping CPU1 compress page B write page from per-cpu mapping CPU1 to zsmalloc pool iret -> process P write page from per-cpu mapping CPU1 to zsmalloc pool [*] return * so we store overwritten data that actually belongs to another page (task) and potentially contains sensitive data. And when process P will page fault it's going to read (swap in) that other task's data. Link: http://lkml.kernel.org/r/20170929045140.4055-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Acked-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 7c38e850a8fc..685049a9048d 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1349,7 +1349,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, * pools/users, we can't allow mapping in interrupt context * because it can corrupt another users mappings. */ - WARN_ON_ONCE(in_interrupt()); + BUG_ON(in_interrupt()); /* From now on, migration cannot move the object */ pin_tag(handle); From 0f10851ea475e08896ee5d9a2036d1bb46a8f3a4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Wed, 15 Nov 2017 17:34:07 -0800 Subject: [PATCH 047/131] mm/mmu_notifier: avoid double notification when it is useless MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch only affects users of mmu_notifier->invalidate_range callback which are device drivers related to ATS/PASID, CAPI, IOMMUv2, SVM ... and it is an optimization for those users. Everyone else is unaffected by it. When clearing a pte/pmd we are given a choice to notify the event under the page table lock (notify version of *_clear_flush helpers do call the mmu_notifier_invalidate_range). But that notification is not necessary in all cases. This patch removes almost all cases where it is useless to have a call to mmu_notifier_invalidate_range before mmu_notifier_invalidate_range_end. It also adds documentation in all those cases explaining why. Below is a more in depth analysis of why this is fine to do this: For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a process virtual address space). There is only 2 cases when you need to notify those secondary TLB while holding page table lock when clearing a pte/pmd: A) page backing address is free before mmu_notifier_invalidate_range_end B) a page table entry is updated to point to a new page (COW, write fault on zero page, __replace_page(), ...) Case A is obvious you do not want to take the risk for the device to write to a page that might now be used by something completely different. Case B is more subtle. For correctness it requires the following sequence to happen: - take page table lock - clear page table entry and notify (pmd/pte_huge_clear_flush_notify()) - set page table entry to point to new page If clearing the page table entry is not followed by a notify before setting the new pte/pmd value then you can break memory model like C11 or C++11 for the device. Consider the following scenario (device use a feature similar to ATS/ PASID): Two address addrA and addrB such that |addrA - addrB| >= PAGE_SIZE we assume they are write protected for COW (other case of B apply too). [Time N] ----------------------------------------------------------------- CPU-thread-0 {try to write to addrA} CPU-thread-1 {try to write to addrB} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {read addrA and populate device TLB} DEV-thread-2 {read addrB and populate device TLB} [Time N+1] --------------------------------------------------------------- CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {} DEV-thread-2 {} [Time N+2] --------------------------------------------------------------- CPU-thread-0 {COW_step1: {update page table point to new page for addrA}} CPU-thread-1 {COW_step1: {update page table point to new page for addrB}} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {} DEV-thread-2 {} [Time N+3] --------------------------------------------------------------- CPU-thread-0 {preempted} CPU-thread-1 {preempted} CPU-thread-2 {write to addrA which is a write to new page} CPU-thread-3 {} DEV-thread-0 {} DEV-thread-2 {} [Time N+3] --------------------------------------------------------------- CPU-thread-0 {preempted} CPU-thread-1 {preempted} CPU-thread-2 {} CPU-thread-3 {write to addrB which is a write to new page} DEV-thread-0 {} DEV-thread-2 {} [Time N+4] --------------------------------------------------------------- CPU-thread-0 {preempted} CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {} DEV-thread-2 {} [Time N+5] --------------------------------------------------------------- CPU-thread-0 {preempted} CPU-thread-1 {} CPU-thread-2 {} CPU-thread-3 {} DEV-thread-0 {read addrA from old page} DEV-thread-2 {read addrB from new page} So here because at time N+2 the clear page table entry was not pair with a notification to invalidate the secondary TLB, the device see the new value for addrB before seing the new value for addrA. This break total memory ordering for the device. When changing a pte to write protect or to point to a new write protected page with same content (KSM) it is ok to delay invalidate_range callback to mmu_notifier_invalidate_range_end() outside the page table lock. This is true even if the thread doing page table update is preempted right after releasing page table lock before calling mmu_notifier_invalidate_range_end Thanks to Andrea for thinking of a problematic scenario for COW. [jglisse@redhat.com: v2] Link: http://lkml.kernel.org/r/20171017031003.7481-2-jglisse@redhat.com Link: http://lkml.kernel.org/r/20170901173011.10745-1-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Andrea Arcangeli Cc: Nadav Amit Cc: Joerg Roedel Cc: Suravee Suthikulpanit Cc: David Woodhouse Cc: Alistair Popple Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Stephen Rothwell Cc: Andrew Donnellan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/mmu_notifier.txt | 93 +++++++++++++++++++++++++++++++ fs/dax.c | 9 ++- include/linux/mmu_notifier.h | 3 +- mm/huge_memory.c | 20 ++++++- mm/hugetlb.c | 16 +++++- mm/ksm.c | 15 ++++- mm/rmap.c | 59 ++++++++++++++++++-- 7 files changed, 198 insertions(+), 17 deletions(-) create mode 100644 Documentation/vm/mmu_notifier.txt diff --git a/Documentation/vm/mmu_notifier.txt b/Documentation/vm/mmu_notifier.txt new file mode 100644 index 000000000000..23b462566bb7 --- /dev/null +++ b/Documentation/vm/mmu_notifier.txt @@ -0,0 +1,93 @@ +When do you need to notify inside page table lock ? + +When clearing a pte/pmd we are given a choice to notify the event through +(notify version of *_clear_flush call mmu_notifier_invalidate_range) under +the page table lock. But that notification is not necessary in all cases. + +For secondary TLB (non CPU TLB) like IOMMU TLB or device TLB (when device use +thing like ATS/PASID to get the IOMMU to walk the CPU page table to access a +process virtual address space). There is only 2 cases when you need to notify +those secondary TLB while holding page table lock when clearing a pte/pmd: + + A) page backing address is free before mmu_notifier_invalidate_range_end() + B) a page table entry is updated to point to a new page (COW, write fault + on zero page, __replace_page(), ...) + +Case A is obvious you do not want to take the risk for the device to write to +a page that might now be used by some completely different task. + +Case B is more subtle. For correctness it requires the following sequence to +happen: + - take page table lock + - clear page table entry and notify ([pmd/pte]p_huge_clear_flush_notify()) + - set page table entry to point to new page + +If clearing the page table entry is not followed by a notify before setting +the new pte/pmd value then you can break memory model like C11 or C++11 for +the device. + +Consider the following scenario (device use a feature similar to ATS/PASID): + +Two address addrA and addrB such that |addrA - addrB| >= PAGE_SIZE we assume +they are write protected for COW (other case of B apply too). + +[Time N] -------------------------------------------------------------------- +CPU-thread-0 {try to write to addrA} +CPU-thread-1 {try to write to addrB} +CPU-thread-2 {} +CPU-thread-3 {} +DEV-thread-0 {read addrA and populate device TLB} +DEV-thread-2 {read addrB and populate device TLB} +[Time N+1] ------------------------------------------------------------------ +CPU-thread-0 {COW_step0: {mmu_notifier_invalidate_range_start(addrA)}} +CPU-thread-1 {COW_step0: {mmu_notifier_invalidate_range_start(addrB)}} +CPU-thread-2 {} +CPU-thread-3 {} +DEV-thread-0 {} +DEV-thread-2 {} +[Time N+2] ------------------------------------------------------------------ +CPU-thread-0 {COW_step1: {update page table to point to new page for addrA}} +CPU-thread-1 {COW_step1: {update page table to point to new page for addrB}} +CPU-thread-2 {} +CPU-thread-3 {} +DEV-thread-0 {} +DEV-thread-2 {} +[Time N+3] ------------------------------------------------------------------ +CPU-thread-0 {preempted} +CPU-thread-1 {preempted} +CPU-thread-2 {write to addrA which is a write to new page} +CPU-thread-3 {} +DEV-thread-0 {} +DEV-thread-2 {} +[Time N+3] ------------------------------------------------------------------ +CPU-thread-0 {preempted} +CPU-thread-1 {preempted} +CPU-thread-2 {} +CPU-thread-3 {write to addrB which is a write to new page} +DEV-thread-0 {} +DEV-thread-2 {} +[Time N+4] ------------------------------------------------------------------ +CPU-thread-0 {preempted} +CPU-thread-1 {COW_step3: {mmu_notifier_invalidate_range_end(addrB)}} +CPU-thread-2 {} +CPU-thread-3 {} +DEV-thread-0 {} +DEV-thread-2 {} +[Time N+5] ------------------------------------------------------------------ +CPU-thread-0 {preempted} +CPU-thread-1 {} +CPU-thread-2 {} +CPU-thread-3 {} +DEV-thread-0 {read addrA from old page} +DEV-thread-2 {read addrB from new page} + +So here because at time N+2 the clear page table entry was not pair with a +notification to invalidate the secondary TLB, the device see the new value for +addrB before seing the new value for addrA. This break total memory ordering +for the device. + +When changing a pte to write protect or to point to a new write protected page +with same content (KSM) it is fine to delay the mmu_notifier_invalidate_range +call to mmu_notifier_invalidate_range_end() outside the page table lock. This +is true even if the thread doing the page table update is preempted right after +releasing page table lock but before call mmu_notifier_invalidate_range_end(). diff --git a/fs/dax.c b/fs/dax.c index f3a44a7c14b3..9ec797424e4f 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -614,6 +614,13 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, if (follow_pte_pmd(vma->vm_mm, address, &start, &end, &ptep, &pmdp, &ptl)) continue; + /* + * No need to call mmu_notifier_invalidate_range() as we are + * downgrading page table protection not changing it to point + * to a new page. + * + * See Documentation/vm/mmu_notifier.txt + */ if (pmdp) { #ifdef CONFIG_FS_DAX_PMD pmd_t pmd; @@ -628,7 +635,6 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pmd = pmd_wrprotect(pmd); pmd = pmd_mkclean(pmd); set_pmd_at(vma->vm_mm, address, pmdp, pmd); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pmd: spin_unlock(ptl); #endif @@ -643,7 +649,6 @@ unlock_pmd: pte = pte_wrprotect(pte); pte = pte_mkclean(pte); set_pte_at(vma->vm_mm, address, ptep, pte); - mmu_notifier_invalidate_range(vma->vm_mm, start, end); unlock_pte: pte_unmap_unlock(ptep, ptl); } diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 2cf1c3c807f6..130831718e95 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -156,7 +156,8 @@ struct mmu_notifier_ops { * shared page-tables, it not necessary to implement the * invalidate_range_start()/end() notifiers, as * invalidate_range() alread catches the points in time when an - * external TLB range needs to be flushed. + * external TLB range needs to be flushed. For more in depth + * discussion on this see Documentation/vm/mmu_notifier.txt * * The invalidate_range() function is called under the ptl * spin-lock and not allowed to sleep. diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 003f7bcd0952..07ae73f4ef91 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1189,8 +1189,15 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); + /* + * Leave pmd empty until pte is filled note we must notify here as + * concurrent CPU thread might write to new page before the call to + * mmu_notifier_invalidate_range_end() happens which can lead to a + * device seeing memory write in different order than CPU. + * + * See Documentation/vm/mmu_notifier.txt + */ pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); - /* leave pmd empty until pte is filled */ pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); pmd_populate(vma->vm_mm, &_pmd, pgtable); @@ -2029,8 +2036,15 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pmd_t _pmd; int i; - /* leave pmd empty until pte is filled */ - pmdp_huge_clear_flush_notify(vma, haddr, pmd); + /* + * Leave pmd empty until pte is filled note that it is fine to delay + * notification until mmu_notifier_invalidate_range_end() as we are + * replacing a zero pmd write protected page with a zero pte write + * protected page. + * + * See Documentation/vm/mmu_notifier.txt + */ + pmdp_huge_clear_flush(vma, haddr, pmd); pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2d2ff5e8bf2b..681b300185c0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3256,9 +3256,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz); } else { if (cow) { + /* + * No need to notify as we are downgrading page + * table protection not changing it to point + * to a new page. + * + * See Documentation/vm/mmu_notifier.txt + */ huge_ptep_set_wrprotect(src, addr, src_pte); - mmu_notifier_invalidate_range(src, mmun_start, - mmun_end); } entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); @@ -4318,7 +4323,12 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * and that page table be reused and filled with junk. */ flush_hugetlb_tlb_range(vma, start, end); - mmu_notifier_invalidate_range(mm, start, end); + /* + * No need to call mmu_notifier_invalidate_range() we are downgrading + * page table protection not changing it to point to a new page. + * + * See Documentation/vm/mmu_notifier.txt + */ i_mmap_unlock_write(vma->vm_file->f_mapping); mmu_notifier_invalidate_range_end(mm, start, end); diff --git a/mm/ksm.c b/mm/ksm.c index 6cb60f46cce5..be8f4576f842 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1052,8 +1052,13 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, * So we clear the pte and flush the tlb before the check * this assure us that no O_DIRECT can happen after the check * or in the middle of the check. + * + * No need to notify as we are downgrading page table to read + * only not changing it to point to a new page. + * + * See Documentation/vm/mmu_notifier.txt */ - entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); + entry = ptep_clear_flush(vma, pvmw.address, pvmw.pte); /* * Check that no O_DIRECT or similar I/O is in progress on the * page @@ -1136,7 +1141,13 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, } flush_cache_page(vma, addr, pte_pfn(*ptep)); - ptep_clear_flush_notify(vma, addr, ptep); + /* + * No need to notify as we are replacing a read only page with another + * read only page with the same content. + * + * See Documentation/vm/mmu_notifier.txt + */ + ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); page_remove_rmap(page, false); diff --git a/mm/rmap.c b/mm/rmap.c index b874c4761e84..7dfc0975de4b 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -939,10 +939,15 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, #endif } - if (ret) { - mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend); + /* + * No need to call mmu_notifier_invalidate_range() as we are + * downgrading page table protection not changing it to point + * to a new page. + * + * See Documentation/vm/mmu_notifier.txt + */ + if (ret) (*cleaned)++; - } } mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); @@ -1426,6 +1431,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte); + /* + * No need to invalidate here it will synchronize on + * against the special swap migration pte. + */ goto discard; } @@ -1483,6 +1492,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * will take care of the rest. */ dec_mm_counter(mm, mm_counter(page)); + /* We have to invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); } else if (IS_ENABLED(CONFIG_MIGRATION) && (flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) { swp_entry_t entry; @@ -1498,6 +1510,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); + /* + * No need to invalidate here it will synchronize on + * against the special swap migration pte. + */ } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(subpage) }; pte_t swp_pte; @@ -1509,6 +1525,8 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, WARN_ON_ONCE(1); ret = false; /* We have to invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); page_vma_mapped_walk_done(&pvmw); break; } @@ -1516,6 +1534,9 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* MADV_FREE page check */ if (!PageSwapBacked(page)) { if (!PageDirty(page)) { + /* Invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, + address, address + PAGE_SIZE); dec_mm_counter(mm, MM_ANONPAGES); goto discard; } @@ -1549,13 +1570,39 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (pte_soft_dirty(pteval)) swp_pte = pte_swp_mksoft_dirty(swp_pte); set_pte_at(mm, address, pvmw.pte, swp_pte); - } else + /* Invalidate as we cleared the pte */ + mmu_notifier_invalidate_range(mm, address, + address + PAGE_SIZE); + } else { + /* + * We should not need to notify here as we reach this + * case only from freeze_page() itself only call from + * split_huge_page_to_list() so everything below must + * be true: + * - page is not anonymous + * - page is locked + * + * So as it is a locked file back page thus it can not + * be remove from the page cache and replace by a new + * page before mmu_notifier_invalidate_range_end so no + * concurrent thread might update its page table to + * point at new page while a device still is using this + * page. + * + * See Documentation/vm/mmu_notifier.txt + */ dec_mm_counter(mm, mm_counter_file(page)); + } discard: + /* + * No need to call mmu_notifier_invalidate_range() it has be + * done above for all cases requiring it to happen under page + * table lock before mmu_notifier_invalidate_range_end() + * + * See Documentation/vm/mmu_notifier.txt + */ page_remove_rmap(subpage, PageHuge(page)); put_page(page); - mmu_notifier_invalidate_range(mm, address, - address + PAGE_SIZE); } mmu_notifier_invalidate_range_end(vma->vm_mm, start, end); From 4645b9fe84bf4878f04c7959a75df7c3c2d1bbb9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=A9r=C3=B4me=20Glisse?= Date: Wed, 15 Nov 2017 17:34:11 -0800 Subject: [PATCH 048/131] mm/mmu_notifier: avoid call to invalidate_range() in range_end() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is an optimization patch that only affect mmu_notifier users which rely on the invalidate_range() callback. This patch avoids calling that callback twice in a row from inside __mmu_notifier_invalidate_range_end Existing pattern (before this patch): mmu_notifier_invalidate_range_start() pte/pmd/pud_clear_flush_notify() mmu_notifier_invalidate_range() mmu_notifier_invalidate_range_end() mmu_notifier_invalidate_range() New pattern (after this patch): mmu_notifier_invalidate_range_start() pte/pmd/pud_clear_flush_notify() mmu_notifier_invalidate_range() mmu_notifier_invalidate_range_only_end() We call the invalidate_range callback after clearing the page table under the page table lock and we skip the call to invalidate_range inside the __mmu_notifier_invalidate_range_end() function. Idea from Andrea Arcangeli Link: http://lkml.kernel.org/r/20171017031003.7481-3-jglisse@redhat.com Signed-off-by: Jérôme Glisse Cc: Andrea Arcangeli Cc: Joerg Roedel Cc: Suravee Suthikulpanit Cc: David Woodhouse Cc: Alistair Popple Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Stephen Rothwell Cc: Andrew Donnellan Cc: Nadav Amit Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmu_notifier.h | 17 +++++++++++-- mm/huge_memory.c | 46 ++++++++++++++++++++++++++++++++---- mm/memory.c | 6 ++++- mm/migrate.c | 15 +++++++++--- mm/mmu_notifier.c | 11 +++++++-- 5 files changed, 83 insertions(+), 12 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 130831718e95..b25dc9db19fc 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -214,7 +214,8 @@ extern void __mmu_notifier_change_pte(struct mm_struct *mm, extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end); extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, + bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); @@ -268,7 +269,14 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range_end(mm, start, end); + __mmu_notifier_invalidate_range_end(mm, start, end, false); +} + +static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + if (mm_has_notifiers(mm)) + __mmu_notifier_invalidate_range_end(mm, start, end, true); } static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, @@ -439,6 +447,11 @@ static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, { } +static inline void mmu_notifier_invalidate_range_only_end(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ +} + static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 07ae73f4ef91..cc65fb87c9db 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1223,7 +1223,12 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, page_remove_rmap(page, true); spin_unlock(vmf->ptl); - mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pmdp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, + mmun_end); ret |= VM_FAULT_WRITE; put_page(page); @@ -1372,7 +1377,12 @@ alloc: } spin_unlock(vmf->ptl); out_mn: - mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pmdp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(vma->vm_mm, mmun_start, + mmun_end); out: return ret; out_unlock: @@ -2024,7 +2034,12 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, out: spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PUD_SIZE); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pudp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + + HPAGE_PUD_SIZE); } #endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */ @@ -2099,6 +2114,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, add_mm_counter(mm, MM_FILEPAGES, -HPAGE_PMD_NR); return; } else if (is_huge_zero_pmd(*pmd)) { + /* + * FIXME: Do we want to invalidate secondary mmu by calling + * mmu_notifier_invalidate_range() see comments below inside + * __split_huge_pmd() ? + * + * We are going from a zero huge page write protected to zero + * small page also write protected so it does not seems useful + * to invalidate secondary mmu at this time. + */ return __split_huge_zero_page_pmd(vma, haddr, pmd); } @@ -2234,7 +2258,21 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, __split_huge_pmd_locked(vma, pmd, haddr, freeze); out: spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, haddr, haddr + HPAGE_PMD_SIZE); + /* + * No need to double call mmu_notifier->invalidate_range() callback. + * They are 3 cases to consider inside __split_huge_pmd_locked(): + * 1) pmdp_huge_clear_flush_notify() call invalidate_range() obvious + * 2) __split_huge_zero_page_pmd() read only zero page and any write + * fault will trigger a flush_notify before pointing to a new page + * (it is fine if the secondary mmu keeps pointing to the old zero + * page in the meantime) + * 3) Split a huge pmd into pte pointing to the same page. No need + * to invalidate secondary tlb entry they are all still valid. + * any further changes to individual pte will notify. So no need + * to call mmu_notifier->invalidate_range() + */ + mmu_notifier_invalidate_range_only_end(mm, haddr, haddr + + HPAGE_PMD_SIZE); } void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index a4518aedf4dd..42fb30300bb5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2554,7 +2554,11 @@ static int wp_page_copy(struct vm_fault *vmf) put_page(new_page); pte_unmap_unlock(vmf->pte, vmf->ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above ptep_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); if (old_page) { /* * Don't let another task, with possibly unlocked vma, diff --git a/mm/migrate.c b/mm/migrate.c index 1236449b4777..4d0be47a322a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2089,7 +2089,11 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, set_page_owner_migrate_reason(new_page, MR_NUMA_MISPLACED); spin_unlock(ptl); - mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above pmdp_huge_clear_flush_notify() did already call it. + */ + mmu_notifier_invalidate_range_only_end(mm, mmun_start, mmun_end); /* Take an "isolate" reference and put new page on the LRU. */ get_page(new_page); @@ -2805,9 +2809,14 @@ static void migrate_vma_pages(struct migrate_vma *migrate) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; } + /* + * No need to double call mmu_notifier->invalidate_range() callback as + * the above ptep_clear_flush_notify() inside migrate_vma_insert_page() + * did already call it. + */ if (notified) - mmu_notifier_invalidate_range_end(mm, mmu_start, - migrate->end); + mmu_notifier_invalidate_range_only_end(mm, mmu_start, + migrate->end); } /* diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 314285284e6e..96edb33fd09a 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -190,7 +190,9 @@ void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, - unsigned long start, unsigned long end) + unsigned long start, + unsigned long end, + bool only_end) { struct mmu_notifier *mn; int id; @@ -204,8 +206,13 @@ void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, * subsystem registers either invalidate_range_start()/end() or * invalidate_range(), so this will be no additional overhead * (besides the pointer check). + * + * We skip call to invalidate_range() if we know it is safe ie + * call site use mmu_notifier_invalidate_range_only_end() which + * is safe to do when we know that a call to invalidate_range() + * already happen under page table lock. */ - if (mn->ops->invalidate_range) + if (!only_end && mn->ops->invalidate_range) mn->ops->invalidate_range(mn, mm, start, end); if (mn->ops->invalidate_range_end) mn->ops->invalidate_range_end(mn, mm, start, end); From 3a50d14d0df5776e002a8683a290c87eeac93a21 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 15 Nov 2017 17:34:15 -0800 Subject: [PATCH 049/131] mm: remove unused pgdat->inactive_ratio Since commit 59dc76b0d4df ("mm: vmscan: reduce size of inactive file list") 'pgdat->inactive_ratio' is not used, except for printing "node_inactive_ratio: 0" in /proc/zoneinfo output. Remove it. Link: http://lkml.kernel.org/r/20171003152611.27483-1-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 ------ mm/vmscan.c | 2 +- mm/vmstat.c | 6 ++---- 3 files changed, 3 insertions(+), 11 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index a507f43ad221..39ccaa9c428b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -712,12 +712,6 @@ typedef struct pglist_data { /* Fields commonly accessed by the page reclaim scanner */ struct lruvec lruvec; - /* - * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on - * this node's LRU. Maintained by the pageout code. - */ - unsigned int inactive_ratio; - unsigned long flags; ZONE_PADDING(_pad2_) diff --git a/mm/vmscan.c b/mm/vmscan.c index 15b483ef6440..2852b8c5a917 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2082,7 +2082,7 @@ static void shrink_active_list(unsigned long nr_to_scan, * If that fails and refaulting is observed, the inactive list grows. * * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages - * on this LRU, maintained by the pageout code. A zone->inactive_ratio + * on this LRU, maintained by the pageout code. An inactive_ratio * of 3 means 3:1 or 25% of the pages are kept on the inactive list. * * total target max diff --git a/mm/vmstat.c b/mm/vmstat.c index 4bb13e72ac97..7d11554861e4 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1564,11 +1564,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, } seq_printf(m, "\n node_unreclaimable: %u" - "\n start_pfn: %lu" - "\n node_inactive_ratio: %u", + "\n start_pfn: %lu", pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES, - zone->zone_start_pfn, - zone->zone_pgdat->inactive_ratio); + zone->zone_start_pfn); seq_putc(m, '\n'); } From a2e16731728a285bcfcece0feaaa8cf478d24022 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 15 Nov 2017 17:34:18 -0800 Subject: [PATCH 050/131] mm/swap_slots.c: fix race conditions in swap_slots cache init Memory allocations can happen before the swap_slots cache initialization is completed during cpu bring up. If we are low on memory, we could call get_swap_page() and access swap_slots_cache before it is fully initialized. Add a check in get_swap_page() for initialized swap_slots_cache to prevent this condition. Similar check already exists in free_swap_slot. Also annotate the checks to indicate the likely condition. We also added a memory barrier to make sure that the locks initialization are done before the assignment of cache->slots and cache->slots_ret pointers. This ensures the assumption that it is safe to acquire the slots cache locks and use the slots cache when the corresponding cache->slots or cache->slots_ret pointers are non null. [akpm@linux-foundation.org: tidy up comment] [akpm@linux-foundation.org: fix spello in comment] Link: http://lkml.kernel.org/r/65a9d0f133f63e66bba37b53b2fd0464b7cae771.1500677066.git.tim.c.chen@linux.intel.com Signed-off-by: Tim Chen Reported-by: Wenwei Tao Acked-by: Ying Huang Cc: Minchan Kim Cc: Rik van Riel Cc: Andrea Arcangeli Cc: Johannes Weiner Cc: Michal Hocko Cc: Hillf Danton Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_slots.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/mm/swap_slots.c b/mm/swap_slots.c index d81cfc5a43d5..bebc19292018 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -149,6 +149,13 @@ static int alloc_swap_slot_cache(unsigned int cpu) cache->nr = 0; cache->cur = 0; cache->n_ret = 0; + /* + * We initialized alloc_lock and free_lock earlier. We use + * !cache->slots or !cache->slots_ret to know if it is safe to acquire + * the corresponding lock and use the cache. Memory barrier below + * ensures the assumption. + */ + mb(); cache->slots = slots; slots = NULL; cache->slots_ret = slots_ret; @@ -275,7 +282,7 @@ int free_swap_slot(swp_entry_t entry) struct swap_slots_cache *cache; cache = raw_cpu_ptr(&swp_slots); - if (use_swap_slot_cache && cache->slots_ret) { + if (likely(use_swap_slot_cache && cache->slots_ret)) { spin_lock_irq(&cache->free_lock); /* Swap slots cache may be deactivated before acquiring lock */ if (!use_swap_slot_cache || !cache->slots_ret) { @@ -326,7 +333,7 @@ swp_entry_t get_swap_page(struct page *page) */ cache = raw_cpu_ptr(&swp_slots); - if (check_cache_active()) { + if (likely(check_cache_active() && cache->slots)) { mutex_lock(&cache->alloc_lock); if (cache->slots) { repeat: From 8745808fda84c638e45cc860c8fb600bf4b0a2a6 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Nov 2017 17:34:22 -0800 Subject: [PATCH 051/131] mm, arch: remove empty_bad_page* empty_bad_page() and empty_bad_pte_table() seem to be relics from old days which is not used by any code for a long time. I have tried to find when exactly but this is not really all that straightforward due to many code movements - traces disappear around 2.4 times. Anyway no code really references neither empty_bad_page nor empty_bad_pte_table. We only allocate the storage which is not used by anybody so remove them. Link: http://lkml.kernel.org/r/20171004150045.30755-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Ralf Baechle Acked-by: Ingo Molnar Cc: Yoshinori Sato Cc: David Howells Cc: Rich Felker Cc: Jeff Dike Cc: Richard Weinberger Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/frv/mm/init.c | 14 -------------- arch/h8300/mm/init.c | 13 ------------- arch/mips/include/asm/pgtable-64.h | 8 +------- arch/mn10300/kernel/head.S | 8 -------- arch/sh/kernel/head_64.S | 8 -------- arch/um/kernel/mem.c | 3 --- include/linux/page-flags.h | 2 +- 7 files changed, 2 insertions(+), 54 deletions(-) diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c index 328f0a292316..cf464100e838 100644 --- a/arch/frv/mm/init.c +++ b/arch/frv/mm/init.c @@ -42,21 +42,9 @@ #undef DEBUG /* - * BAD_PAGE is the page that is used for page faults when linux - * is out-of-memory. Older versions of linux just did a - * do_exit(), but using this instead means there is less risk - * for a process dying in kernel mode, possibly leaving a inode - * unused etc.. - * - * BAD_PAGETABLE is the accompanying page-table: it is initialized - * to point to BAD_PAGE entries. - * * ZERO_PAGE is a special page that is used for zero-initialized * data and COW. */ -static unsigned long empty_bad_page_table; -static unsigned long empty_bad_page; - unsigned long empty_zero_page; EXPORT_SYMBOL(empty_zero_page); @@ -72,8 +60,6 @@ void __init paging_init(void) unsigned long zones_size[MAX_NR_ZONES] = {0, }; /* allocate some pages for kernel housekeeping tasks */ - empty_bad_page_table = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); - empty_bad_page = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); empty_zero_page = (unsigned long) alloc_bootmem_pages(PAGE_SIZE); memset((void *) empty_zero_page, 0, PAGE_SIZE); diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index eeead51bed2d..015287ac8ce8 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -40,20 +40,9 @@ #include /* - * BAD_PAGE is the page that is used for page faults when linux - * is out-of-memory. Older versions of linux just did a - * do_exit(), but using this instead means there is less risk - * for a process dying in kernel mode, possibly leaving a inode - * unused etc.. - * - * BAD_PAGETABLE is the accompanying page-table: it is initialized - * to point to BAD_PAGE entries. - * * ZERO_PAGE is a special page that is used for zero-initialized * data and COW. */ -static unsigned long empty_bad_page_table; -static unsigned long empty_bad_page; unsigned long empty_zero_page; /* @@ -78,8 +67,6 @@ void __init paging_init(void) * Initialize the bad page table and bad page to point * to a couple of allocated pages. */ - empty_bad_page_table = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); - empty_bad_page = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); empty_zero_page = (unsigned long)alloc_bootmem_pages(PAGE_SIZE); memset((void *)empty_zero_page, 0, PAGE_SIZE); diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index 67fe6dc5211c..0036ea0c7173 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -31,12 +31,7 @@ * tables. Each page table is also a single 4K page, giving 512 (== * PTRS_PER_PTE) 8 byte ptes. Each pud entry is initialized to point to * invalid_pmd_table, each pmd entry is initialized to point to - * invalid_pte_table, each pte is initialized to 0. When memory is low, - * and a pmd table or a page table allocation fails, empty_bad_pmd_table - * and empty_bad_page_table is returned back to higher layer code, so - * that the failure is recognized later on. Linux does not seem to - * handle these failures very well though. The empty_bad_page_table has - * invalid pte entries in it, to force page faults. + * invalid_pte_table, each pte is initialized to 0. * * Kernel mappings: kernel mappings are held in the swapper_pg_table. * The layout is identical to userspace except it's indexed with the @@ -175,7 +170,6 @@ printk("%s:%d: bad pgd %016lx.\n", __FILE__, __LINE__, pgd_val(e)) extern pte_t invalid_pte_table[PTRS_PER_PTE]; -extern pte_t empty_bad_page_table[PTRS_PER_PTE]; #ifndef __PAGETABLE_PUD_FOLDED /* diff --git a/arch/mn10300/kernel/head.S b/arch/mn10300/kernel/head.S index 73e00fc78072..0b15f759e0d2 100644 --- a/arch/mn10300/kernel/head.S +++ b/arch/mn10300/kernel/head.S @@ -433,14 +433,6 @@ ENTRY(swapper_pg_dir) ENTRY(empty_zero_page) .space PAGE_SIZE - .balign PAGE_SIZE -ENTRY(empty_bad_page) - .space PAGE_SIZE - - .balign PAGE_SIZE -ENTRY(empty_bad_pte_table) - .space PAGE_SIZE - .balign PAGE_SIZE ENTRY(large_page_table) .space PAGE_SIZE diff --git a/arch/sh/kernel/head_64.S b/arch/sh/kernel/head_64.S index defd851abefa..cca491397a28 100644 --- a/arch/sh/kernel/head_64.S +++ b/arch/sh/kernel/head_64.S @@ -101,14 +101,6 @@ empty_zero_page: mmu_pdtp_cache: .space PAGE_SIZE, 0 - .global empty_bad_page -empty_bad_page: - .space PAGE_SIZE, 0 - - .global empty_bad_pte_table -empty_bad_pte_table: - .space PAGE_SIZE, 0 - .global fpu_in_use fpu_in_use: .quad 0 diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index e7437ec62710..3c0e470ea646 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -22,8 +22,6 @@ /* allocated in paging_init, zeroed in mem_init, and unchanged thereafter */ unsigned long *empty_zero_page = NULL; EXPORT_SYMBOL(empty_zero_page); -/* allocated in paging_init and unchanged thereafter */ -static unsigned long *empty_bad_page = NULL; /* * Initialized during boot, and readonly for initializing page tables @@ -146,7 +144,6 @@ void __init paging_init(void) int i; empty_zero_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE); - empty_bad_page = (unsigned long *) alloc_bootmem_low_pages(PAGE_SIZE); for (i = 0; i < ARRAY_SIZE(zones_size); i++) zones_size[i] = 0; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 584b14c774c1..3ec44e27aa9d 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -18,7 +18,7 @@ * Various page->flags bits: * * PG_reserved is set for special pages, which can never be swapped out. Some - * of them might not even exist (eg empty_bad_page)... + * of them might not even exist... * * The PG_private bitflag is set on pagecache pages if they contain filesystem * specific data (which is normally at page->private). It can be used by From 5984af1082f3b115082178ed88c47033d43b924d Mon Sep 17 00:00:00 2001 From: Pintu Agarwal Date: Wed, 15 Nov 2017 17:34:26 -0800 Subject: [PATCH 052/131] mm/cma.c: change pr_info to pr_err for cma_alloc fail log It was observed that under cma_alloc fail log, pr_info was used instead of pr_err. This will lead to problems if printk debug level is set to below 7. In this case the cma_alloc failure log will not be captured in the log and it will be difficult to debug. Simply replace the pr_info with pr_err to capture failure log. Link: http://lkml.kernel.org/r/1507650633-4430-1-git-send-email-pintu.ping@gmail.com Signed-off-by: Pintu Agarwal Cc: Laura Abbott Cc: Greg Kroah-Hartman Cc: Jaewon Kim Cc: Doug Berger Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/cma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/cma.c b/mm/cma.c index 022e52bd8370..0607729abf3b 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -461,7 +461,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align, trace_cma_alloc(pfn, page, count, align); if (ret && !(gfp_mask & __GFP_NOWARN)) { - pr_info("%s: alloc failed, req-size: %zu pages, ret: %d\n", + pr_err("%s: alloc failed, req-size: %zu pages, ret: %d\n", __func__, count, ret); cma_debug_show_areas(cma); } From 6b4c54e3787bc03e810062bd257a3b05fd9c72d6 Mon Sep 17 00:00:00 2001 From: Ayush Mittal Date: Wed, 15 Nov 2017 17:34:30 -0800 Subject: [PATCH 053/131] mm/page_owner.c: reduce page_owner structure size Maximum page order can be at max 10 which can be accomodated in short data type(2 bytes). last_migrate_reason is defined as enum type whose values can be accomodated in short data type (2 bytes). Total structure size is currently 16 bytes but after changing structure size it goes to 12 bytes. Vlastimil said: "Looks like it works, so why not. Before: [ 0.001000] allocated 50331648 bytes of page_ext After: [ 0.001000] allocated 41943040 bytes of page_ext" Link: http://lkml.kernel.org/r/1507623917-37991-1-git-send-email-ayush.m@samsung.com Signed-off-by: Ayush Mittal Acked-by: Vlastimil Babka Cc: Vinayak Menon Cc: Amit Sahrawat Cc: Vaneet Narang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 4f44b95b9d1e..8592543a0f15 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -20,9 +20,9 @@ #define PAGE_OWNER_STACK_DEPTH (16) struct page_owner { - unsigned int order; + unsigned short order; + short last_migrate_reason; gfp_t gfp_mask; - int last_migrate_reason; depot_stack_handle_t handle; }; From 72b045aecdd856b083521f2a963705b4c2e59680 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:33 -0800 Subject: [PATCH 054/131] mm: implement find_get_pages_range_tag() Patch series "Ranged pagevec tagged lookup", v3. In this series I provide a ranged variant of pagevec_lookup_tag() and use it in places where it makes sense. This series removes some common code and it also has a potential for speeding up some operations similarly as for pagevec_lookup_range() (but for now I can think of only artificial cases where this happens). This patch (of 16): Implement a variant of find_get_pages_tag() that stops iterating at given index. Lots of users of this function (through pagevec_lookup()) actually want a range lookup and all of them are currently open-coding this. Also create corresponding pagevec_lookup_range_tag() function. Link: http://lkml.kernel.org/r/20171009151359.31984-2-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Cc: Bob Peterson Cc: Chao Yu Cc: David Howells Cc: David Sterba Cc: Ilya Dryomov Cc: Jaegeuk Kim Cc: Ryusuke Konishi Cc: Steve French Cc: "Theodore Ts'o" Cc: "Yan, Zheng" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 12 ++++++++++-- include/linux/pagevec.h | 11 +++++++++-- mm/filemap.c | 33 ++++++++++++++++++++++++--------- mm/swap.c | 9 +++++---- 4 files changed, 48 insertions(+), 17 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e08b5339023c..956d6a80e126 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -366,8 +366,16 @@ static inline unsigned find_get_pages(struct address_space *mapping, } unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); -unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, - int tag, unsigned int nr_pages, struct page **pages); +unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, + pgoff_t end, int tag, unsigned int nr_pages, + struct page **pages); +static inline unsigned find_get_pages_tag(struct address_space *mapping, + pgoff_t *index, int tag, unsigned int nr_pages, + struct page **pages) +{ + return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, + nr_pages, pages); +} unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, int tag, unsigned int nr_entries, struct page **entries, pgoff_t *indices); diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 2636c0c0f279..afc718f586f8 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -38,9 +38,16 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec, return pagevec_lookup_range(pvec, mapping, start, (pgoff_t)-1); } -unsigned pagevec_lookup_tag(struct pagevec *pvec, +unsigned pagevec_lookup_range_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned nr_pages); +static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, - unsigned nr_pages); + unsigned nr_pages) +{ + return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag, + nr_pages); +} static inline void pagevec_init(struct pagevec *pvec, int cold) { diff --git a/mm/filemap.c b/mm/filemap.c index 594d73fef8b4..cf74d0dacc6a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1754,9 +1754,10 @@ repeat: EXPORT_SYMBOL(find_get_pages_contig); /** - * find_get_pages_tag - find and return pages that match @tag + * find_get_pages_range_tag - find and return pages in given range matching @tag * @mapping: the address_space to search * @index: the starting page index + * @end: The final page index (inclusive) * @tag: the tag index * @nr_pages: the maximum number of pages * @pages: where the resulting pages are placed @@ -1764,8 +1765,9 @@ EXPORT_SYMBOL(find_get_pages_contig); * Like find_get_pages, except we only return pages which are tagged with * @tag. We update @index to index the next page for the traversal. */ -unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, - int tag, unsigned int nr_pages, struct page **pages) +unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, + pgoff_t end, int tag, unsigned int nr_pages, + struct page **pages) { struct radix_tree_iter iter; void **slot; @@ -1778,6 +1780,9 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, *index, tag) { struct page *head, *page; + + if (iter.index > end) + break; repeat: page = radix_tree_deref_slot(slot); if (unlikely(!page)) @@ -1819,18 +1824,28 @@ repeat: } pages[ret] = page; - if (++ret == nr_pages) - break; + if (++ret == nr_pages) { + *index = pages[ret - 1]->index + 1; + goto out; + } } + /* + * We come here when we got at @end. We take care to not overflow the + * index @index as it confuses some of the callers. This breaks the + * iteration when there is page at index -1 but that is already broken + * anyway. + */ + if (end == (pgoff_t)-1) + *index = (pgoff_t)-1; + else + *index = end + 1; +out: rcu_read_unlock(); - if (ret) - *index = pages[ret - 1]->index + 1; - return ret; } -EXPORT_SYMBOL(find_get_pages_tag); +EXPORT_SYMBOL(find_get_pages_range_tag); /** * find_get_entries_tag - find and return entries that match @tag diff --git a/mm/swap.c b/mm/swap.c index a77d68f2c1b6..e1c74eb8a775 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -986,14 +986,15 @@ unsigned pagevec_lookup_range(struct pagevec *pvec, } EXPORT_SYMBOL(pagevec_lookup_range); -unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, - pgoff_t *index, int tag, unsigned nr_pages) +unsigned pagevec_lookup_range_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned nr_pages) { - pvec->nr = find_get_pages_tag(mapping, index, tag, + pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, nr_pages, pvec->pages); return pagevec_count(pvec); } -EXPORT_SYMBOL(pagevec_lookup_tag); +EXPORT_SYMBOL(pagevec_lookup_range_tag); /* * Perform any setup for the swap system From 4006f437f965a74a31ba407138d646fbd3f3a822 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:37 -0800 Subject: [PATCH 055/131] btrfs: use pagevec_lookup_range_tag() We want only pages from given range in btree_write_cache_pages() and extent_write_cache_pages(). Use pagevec_lookup_range_tag() instead of pagevec_lookup_tag() and remove unnecessary code. Link: http://lkml.kernel.org/r/20171009151359.31984-3-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: David Sterba Reviewed-by: Daniel Jordan Cc: David Sterba Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/extent_io.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index adbbc017191c..c18cf5d59521 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3814,8 +3814,8 @@ retry: if (wbc->sync_mode == WB_SYNC_ALL) tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag, PAGEVEC_SIZE))) { unsigned i; scanned = 1; @@ -3825,11 +3825,6 @@ retry: if (!PagePrivate(page)) continue; - if (!wbc->range_cyclic && page->index > end) { - done = 1; - break; - } - spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { spin_unlock(&mapping->private_lock); @@ -3961,8 +3956,8 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag, PAGEVEC_SIZE))) { unsigned i; scanned = 1; @@ -3987,12 +3982,6 @@ retry: continue; } - if (!wbc->range_cyclic && page->index > end) { - done = 1; - unlock_page(page); - continue; - } - if (wbc->sync_mode != WB_SYNC_NONE) { if (PageWriteback(page)) flush_fn(data); From 0ed75fc8d288f414051a94743563ac8c2cb62b0c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:41 -0800 Subject: [PATCH 056/131] ceph: use pagevec_lookup_range_tag() We want only pages from given range in ceph_writepages_start(). Use pagevec_lookup_range_tag() instead of pagevec_lookup_tag() and remove unnecessary code. Link: http://lkml.kernel.org/r/20171009151359.31984-4-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Reviewed-by: "Yan, Zheng" Cc: Ilya Dryomov Cc: "Yan, Zheng" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ceph/addr.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 4d622654bfbc..1af5cc899cb4 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -872,13 +872,10 @@ retry: get_more_pages: pvec_pages = min_t(unsigned, PAGEVEC_SIZE, max_pages - locked_pages); - if (end - index < (u64)(pvec_pages - 1)) - pvec_pages = (unsigned)(end - index) + 1; - - pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, + pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, + end, PAGECACHE_TAG_DIRTY, pvec_pages); - dout("pagevec_lookup_tag got %d\n", pvec_pages); + dout("pagevec_lookup_range_tag got %d\n", pvec_pages); if (!pvec_pages && !locked_pages) break; for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) { @@ -896,16 +893,6 @@ get_more_pages: unlock_page(page); continue; } - if (page->index > end) { - dout("end of range %p\n", page); - /* can't be range_cyclic (1st pass) because - * end == -1 in that case. */ - stop = true; - if (ceph_wbc.head_snapc) - done = true; - unlock_page(page); - break; - } if (strip_unit_end && (page->index > strip_unit_end)) { dout("end of strip unit %p\n", page); unlock_page(page); From dc7f3e868a45de3cfcd3c849ad32331765547b57 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:44 -0800 Subject: [PATCH 057/131] ext4: use pagevec_lookup_range_tag() We want only pages from given range in ext4_writepages(). Use pagevec_lookup_range_tag() instead of pagevec_lookup_tag() and remove unnecessary code. Link: http://lkml.kernel.org/r/20171009151359.31984-5-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/inode.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2633150e41b9..3d0708c91c5a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2620,24 +2620,14 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpd->map.m_len = 0; mpd->next_page = index; while (index <= end) { - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag, PAGEVEC_SIZE); if (nr_pages == 0) goto out; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) - goto out; - /* * Accumulated enough dirty pages? This doesn't apply * to WB_SYNC_ALL mode. For integrity sync we have to From 69c4f35d25e475c07d1e5c033bfb514db360a079 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:48 -0800 Subject: [PATCH 058/131] f2fs: use pagevec_lookup_range_tag() We want only pages from given range in f2fs_write_cache_pages(). Use pagevec_lookup_range_tag() instead of pagevec_lookup_tag() and remove unnecessary code. Link: http://lkml.kernel.org/r/20171009151359.31984-6-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Chao Yu Reviewed-by: Daniel Jordan Cc: Jaegeuk Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/data.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 36b535207c88..17d2c2997ddd 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1669,8 +1669,8 @@ retry: while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag, PAGEVEC_SIZE); if (nr_pages == 0) break; @@ -1678,11 +1678,6 @@ retry: struct page *page = pvec.pages[i]; bool submitted = false; - if (page->index > end) { - done = 1; - break; - } - done_index = page->index; retry_write: lock_page(page); From 028a63a6e381d7d109e19db4740085d1b655c539 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:51 -0800 Subject: [PATCH 059/131] f2fs: simplify page iteration loops In several places we want to iterate over all tagged pages in a mapping. However the code was apparently copied from places that iterate only over a limited range and thus it checks for index <= end, optimizes the case where we are coming close to range end which is all pointless when end == ULONG_MAX. So just remove this dead code. [akpm@linux-foundation.org: fix warnings] Link: http://lkml.kernel.org/r/20171009151359.31984-7-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Reviewed-by: Chao Yu Cc: Jaegeuk Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/checkpoint.c | 13 ++++------ fs/f2fs/node.c | 59 +++++++++++++++----------------------------- 2 files changed, 25 insertions(+), 47 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 04fe1df052b2..fb74ec4ed9d2 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -305,9 +305,10 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); - pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; + pgoff_t index = 0, prev = ULONG_MAX; struct pagevec pvec; long nwritten = 0; + int nr_pages; struct writeback_control wbc = { .for_reclaim = 0, }; @@ -317,13 +318,9 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, blk_start_plug(&plug); - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (unlikely(nr_pages == 0)) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index fca87835a1da..ca020e7d7465 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1277,21 +1277,17 @@ release_page: static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { - pgoff_t index, end; + pgoff_t index; struct pagevec pvec; struct page *last_page = NULL; + int nr_pages; pagevec_init(&pvec, 0); index = 0; - end = ULONG_MAX; - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1425,13 +1421,14 @@ static int f2fs_write_node_page(struct page *page, int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { - pgoff_t index, end; + pgoff_t index; pgoff_t last_idx = ULONG_MAX; struct pagevec pvec; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; + int nr_pages; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1441,15 +1438,10 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, retry: pagevec_init(&pvec, 0); index = 0; - end = ULONG_MAX; - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1548,25 +1540,21 @@ out: int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, bool do_balance, enum iostat_type io_type) { - pgoff_t index, end; + pgoff_t index; struct pagevec pvec; int step = 0; int nwritten = 0; int ret = 0; + int nr_pages; pagevec_init(&pvec, 0); next_step: index = 0; - end = ULONG_MAX; - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -1655,27 +1643,20 @@ out: int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { - pgoff_t index = 0, end = ULONG_MAX; + pgoff_t index = 0; struct pagevec pvec; int ret2, ret = 0; + int nr_pages; pagevec_init(&pvec, 0); - while (index <= end) { - int i, nr_pages; - nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); - if (nr_pages == 0) - break; + while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_WRITEBACK, PAGEVEC_SIZE))) { + int i; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* until radix tree lookup accepts end_index */ - if (unlikely(page->index > end)) - continue; - if (ino && ino_of_node(page) == ino) { f2fs_wait_on_page_writeback(page, NODE, true); if (TestClearPageError(page)) From 8faab64229a5d5f5ab624cbe5d5286d5b74c76cd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:55 -0800 Subject: [PATCH 060/131] f2fs: use find_get_pages_tag() for looking up single page __get_first_dirty_index() wants to lookup only the first dirty page after given index. There's no point in using pagevec_lookup_tag() for that. Just use find_get_pages_tag() directly. Link: http://lkml.kernel.org/r/20171009151359.31984-8-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Chao Yu Reviewed-by: Daniel Jordan Cc: Jaegeuk Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/f2fs/file.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 517e112c8a9a..f78b76ec4707 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -313,18 +313,19 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) static pgoff_t __get_first_dirty_index(struct address_space *mapping, pgoff_t pgofs, int whence) { - struct pagevec pvec; + struct page *page; int nr_pages; if (whence != SEEK_DATA) return 0; /* find first dirty page index */ - pagevec_init(&pvec, 0); - nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, - PAGECACHE_TAG_DIRTY, 1); - pgofs = nr_pages ? pvec.pages[0]->index : ULONG_MAX; - pagevec_release(&pvec); + nr_pages = find_get_pages_tag(mapping, &pgofs, PAGECACHE_TAG_DIRTY, + 1, &page); + if (!nr_pages) + return ULONG_MAX; + pgofs = page->index; + put_page(page); return pgofs; } From d2bc5b3c67a9cce28616fd90e1651b9c9246fae8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:34:58 -0800 Subject: [PATCH 061/131] gfs2: use pagevec_lookup_range_tag() We want only pages from given range in gfs2_write_cache_jdata(). Use pagevec_lookup_range_tag() instead of pagevec_lookup_tag() and remove unnecessary code. Link: http://lkml.kernel.org/r/20171009151359.31984-9-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Cc: Bob Peterson Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/gfs2/aops.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 68ed06962537..d0848d9623fb 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -280,22 +280,6 @@ static int gfs2_write_jdata_pagevec(struct address_space *mapping, for(i = 0; i < nr_pages; i++) { struct page *page = pvec->pages[i]; - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) { - /* - * can't be range_cyclic (1st pass) because - * end == -1 in that case. - */ - ret = 1; - break; - } - *done_index = page->index; lock_page(page); @@ -413,8 +397,8 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag, PAGEVEC_SIZE); if (nr_pages == 0) break; From 40f9c51326efd362dbae6c0e07f0c0ada42a5e00 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:02 -0800 Subject: [PATCH 062/131] nilfs2: use pagevec_lookup_range_tag() We want only pages from given range in nilfs_lookup_dirty_data_buffers(). Use pagevec_lookup_range_tag() instead of pagevec_lookup_tag() and remove unnecessary code. Link: http://lkml.kernel.org/r/20171009151359.31984-10-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Acked-by: Ryusuke Konishi Cc: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/segment.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 70ded52dc1dd..68e5769cef3b 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -711,18 +711,14 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, pagevec_init(&pvec, 0); repeat: if (unlikely(index > last) || - !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - min_t(pgoff_t, last - index, - PAGEVEC_SIZE - 1) + 1)) + !pagevec_lookup_range_tag(&pvec, mapping, &index, last, + PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE)) return ndirties; for (i = 0; i < pagevec_count(&pvec); i++) { struct buffer_head *bh, *head; struct page *page = pvec.pages[i]; - if (unlikely(page->index > last)) - break; - lock_page(page); if (!page_has_buffers(page)) create_empty_buffers(page, i_blocksize(inode), 0); From 312e9d2f7053f480627dcaf5e14f9cae78e3715a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:05 -0800 Subject: [PATCH 063/131] mm: use pagevec_lookup_range_tag() in __filemap_fdatawait_range() Use pagevec_lookup_range_tag() in __filemap_fdatawait_range() as it is interested only in pages from given range. Remove unnecessary code resulting from this. Link: http://lkml.kernel.org/r/20171009151359.31984-11-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index cf74d0dacc6a..229481d258bc 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -420,19 +420,17 @@ static void __filemap_fdatawait_range(struct address_space *mapping, return; pagevec_init(&pvec, 0); - while ((index <= end) && - (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + while (index <= end) { unsigned i; + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, + end, PAGECACHE_TAG_WRITEBACK, PAGEVEC_SIZE); + if (!nr_pages) + break; + for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* until radix tree lookup accepts end_index */ - if (page->index > end) - continue; - wait_on_page_writeback(page); ClearPageError(page); } From 2b9775ae422fa46b4aee2bb2a8d2184a5a3b90e0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:09 -0800 Subject: [PATCH 064/131] mm: use pagevec_lookup_range_tag() in write_cache_pages() Use pagevec_lookup_range_tag() in write_cache_pages() as it is interested only in pages from given range. Remove unnecessary code resulting from this. Link: http://lkml.kernel.org/r/20171009151359.31984-12-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 768fe4e37e6a..460fc022cbc8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2196,30 +2196,14 @@ retry: while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, - min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, + tag, PAGEVEC_SIZE); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - /* - * At this point, the page may be truncated or - * invalidated (changing page->mapping to NULL), or - * even swizzled back from swapper_space to tmpfs file - * mapping. However, page->index will not change - * because we have a reference on the page. - */ - if (page->index > end) { - /* - * can't be range_cyclic (1st pass) because - * end == -1 in that case. - */ - done = 1; - break; - } - done_index = page->index; lock_page(page); From 93d3b7140ad379885849ad2674b4290c9e8273da Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:12 -0800 Subject: [PATCH 065/131] mm: add variant of pagevec_lookup_range_tag() taking number of pages Currently pagevec_lookup_range_tag() takes number of pages to look up but most users don't need this. Create a new function pagevec_lookup_range_nr_tag() that takes maximum number of pages to lookup for Ceph which wants this functionality so that we can drop nr_pages argument from pagevec_lookup_range_tag(). Link: http://lkml.kernel.org/r/20171009151359.31984-13-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 3 +++ mm/swap.c | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index afc718f586f8..87a2dfd62f5e 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -41,6 +41,9 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec, unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, int tag, unsigned nr_pages); +unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned max_pages); static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, int tag, unsigned nr_pages) diff --git a/mm/swap.c b/mm/swap.c index e1c74eb8a775..6c50fec2da92 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -996,6 +996,15 @@ unsigned pagevec_lookup_range_tag(struct pagevec *pvec, } EXPORT_SYMBOL(pagevec_lookup_range_tag); +unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, + struct address_space *mapping, pgoff_t *index, pgoff_t end, + int tag, unsigned max_pages) +{ + pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, + min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages); + return pagevec_count(pvec); +} +EXPORT_SYMBOL(pagevec_lookup_range_nr_tag); /* * Perform any setup for the swap system */ From 4be90299a1693c2112edb20ca78d6cc9f2183326 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:16 -0800 Subject: [PATCH 066/131] ceph: use pagevec_lookup_range_nr_tag() Use new function for looking up pages since nr_pages argument from pagevec_lookup_range_tag() is going away. Link: http://lkml.kernel.org/r/20171009151359.31984-14-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: "Yan, Zheng" Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ceph/addr.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 1af5cc899cb4..0fc2987cb62d 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -870,11 +870,9 @@ retry: max_pages = wsize >> PAGE_SHIFT; get_more_pages: - pvec_pages = min_t(unsigned, PAGEVEC_SIZE, - max_pages - locked_pages); - pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, + pvec_pages = pagevec_lookup_range_nr_tag(&pvec, mapping, &index, end, PAGECACHE_TAG_DIRTY, - pvec_pages); + max_pages - locked_pages); dout("pagevec_lookup_range_tag got %d\n", pvec_pages); if (!pvec_pages && !locked_pages) break; From 67fd707f468142d0f689a6240044bb45c1913003 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:19 -0800 Subject: [PATCH 067/131] mm: remove nr_pages argument from pagevec_lookup_{,range}_tag() All users of pagevec_lookup() and pagevec_lookup_range() now pass PAGEVEC_SIZE as a desired number of pages. Just drop the argument. Link: http://lkml.kernel.org/r/20171009151359.31984-15-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/extent_io.c | 6 +++--- fs/ceph/addr.c | 3 +-- fs/ext4/inode.c | 2 +- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 2 +- fs/f2fs/node.c | 8 ++++---- fs/gfs2/aops.c | 2 +- fs/nilfs2/btree.c | 4 ++-- fs/nilfs2/page.c | 7 +++---- fs/nilfs2/segment.c | 6 +++--- include/linux/pagevec.h | 8 +++----- mm/filemap.c | 2 +- mm/page-writeback.c | 2 +- mm/swap.c | 4 ++-- 14 files changed, 27 insertions(+), 31 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c18cf5d59521..93fdaac81212 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3815,7 +3815,7 @@ retry: tag_pages_for_writeback(mapping, index, end); while (!done && !nr_to_write_done && (index <= end) && (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE))) { + tag))) { unsigned i; scanned = 1; @@ -3956,8 +3956,8 @@ retry: tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && !nr_to_write_done && (index <= end) && - (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE))) { + (nr_pages = pagevec_lookup_range_tag(&pvec, mapping, + &index, end, tag))) { unsigned i; scanned = 1; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 0fc2987cb62d..26c682db94ee 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1162,8 +1162,7 @@ release_pvec_pages: index = 0; while ((index <= end) && (nr = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_WRITEBACK, - PAGEVEC_SIZE))) { + PAGECACHE_TAG_WRITEBACK))) { for (i = 0; i < nr; i++) { page = pvec.pages[i]; if (page_snap_context(page) != snapc) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3d0708c91c5a..b239a41dbeeb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2621,7 +2621,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) mpd->next_page = index; while (index <= end) { nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE); + tag); if (nr_pages == 0) goto out; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index fb74ec4ed9d2..6124f8710dc3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -319,7 +319,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, blk_start_plug(&plug); while ((nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 17d2c2997ddd..687703755824 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1670,7 +1670,7 @@ retry: int i; nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE); + tag); if (nr_pages == 0) break; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ca020e7d7465..d6e4df0bb622 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1286,7 +1286,7 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) index = 0; while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { @@ -1440,7 +1440,7 @@ retry: index = 0; while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { @@ -1553,7 +1553,7 @@ next_step: index = 0; while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE))) { + PAGECACHE_TAG_DIRTY))) { int i; for (i = 0; i < nr_pages; i++) { @@ -1651,7 +1651,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) pagevec_init(&pvec, 0); while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, - PAGECACHE_TAG_WRITEBACK, PAGEVEC_SIZE))) { + PAGECACHE_TAG_WRITEBACK))) { int i; for (i = 0; i < nr_pages; i++) { diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index d0848d9623fb..3fea3d7780b0 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -398,7 +398,7 @@ retry: done_index = index; while (!done && (index <= end)) { nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE); + tag); if (nr_pages == 0) break; diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 06ffa135dfa6..35989c7bb065 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2158,8 +2158,8 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, pagevec_init(&pvec, 0); - while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, btcache, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { bh = head = page_buffers(pvec.pages[i]); do { diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 8616c46d33da..1c16726915c1 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -257,8 +257,7 @@ int nilfs_copy_dirty_pages(struct address_space *dmap, pagevec_init(&pvec, 0); repeat: - if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) + if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY)) return 0; for (i = 0; i < pagevec_count(&pvec); i++) { @@ -376,8 +375,8 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) pagevec_init(&pvec, 0); - while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 68e5769cef3b..19366ab20bea 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -712,7 +712,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, repeat: if (unlikely(index > last) || !pagevec_lookup_range_tag(&pvec, mapping, &index, last, - PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE)) + PAGECACHE_TAG_DIRTY)) return ndirties; for (i = 0; i < pagevec_count(&pvec); i++) { @@ -755,8 +755,8 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, pagevec_init(&pvec, 0); - while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY, - PAGEVEC_SIZE)) { + while (pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY)) { for (i = 0; i < pagevec_count(&pvec); i++) { bh = head = page_buffers(pvec.pages[i]); do { diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 87a2dfd62f5e..4f56e0ad9d00 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -40,16 +40,14 @@ static inline unsigned pagevec_lookup(struct pagevec *pvec, unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag, unsigned nr_pages); + int tag); unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, int tag, unsigned max_pages); static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, - struct address_space *mapping, pgoff_t *index, int tag, - unsigned nr_pages) + struct address_space *mapping, pgoff_t *index, int tag) { - return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag, - nr_pages); + return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag); } static inline void pagevec_init(struct pagevec *pvec, int cold) diff --git a/mm/filemap.c b/mm/filemap.c index 229481d258bc..6eb4e32d99c8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -424,7 +424,7 @@ static void __filemap_fdatawait_range(struct address_space *mapping, unsigned i; nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, - end, PAGECACHE_TAG_WRITEBACK, PAGEVEC_SIZE); + end, PAGECACHE_TAG_WRITEBACK); if (!nr_pages) break; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 460fc022cbc8..231651a1486d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2197,7 +2197,7 @@ retry: int i; nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, - tag, PAGEVEC_SIZE); + tag); if (nr_pages == 0) break; diff --git a/mm/swap.c b/mm/swap.c index 6c50fec2da92..4edac536fe24 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -988,10 +988,10 @@ EXPORT_SYMBOL(pagevec_lookup_range); unsigned pagevec_lookup_range_tag(struct pagevec *pvec, struct address_space *mapping, pgoff_t *index, pgoff_t end, - int tag, unsigned nr_pages) + int tag) { pvec->nr = find_get_pages_range_tag(mapping, index, end, tag, - nr_pages, pvec->pages); + PAGEVEC_SIZE, pvec->pages); return pagevec_count(pvec); } EXPORT_SYMBOL(pagevec_lookup_range_tag); From aef6e415eefada1d25ec7f7775e9a02bc20bf264 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:23 -0800 Subject: [PATCH 068/131] afs: use find_get_pages_range_tag() Use find_get_pages_range_tag() in afs_writepages_region() as we are interested only in pages from given range. Remove unnecessary code after this conversion. Link: http://lkml.kernel.org/r/20171009151359.31984-16-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Daniel Jordan Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/afs/write.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/afs/write.c b/fs/afs/write.c index 106e43db1115..d62a6b54152d 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -497,20 +497,13 @@ static int afs_writepages_region(struct address_space *mapping, _enter(",,%lx,%lx,", index, end); do { - n = find_get_pages_tag(mapping, &index, PAGECACHE_TAG_DIRTY, - 1, &page); + n = find_get_pages_range_tag(mapping, &index, end, + PAGECACHE_TAG_DIRTY, 1, &page); if (!n) break; _debug("wback %lx", page->index); - if (page->index > end) { - *_next = index; - put_page(page); - _leave(" = 0 [%lx]", *_next); - return 0; - } - /* at this point we hold neither mapping->tree_lock nor lock on * the page itself: the page may be truncated or invalidated * (changing page->mapping to NULL), or even swizzled back from From 9c19a9cb1642c074aa8bc7693cd4c038643960ae Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:35:26 -0800 Subject: [PATCH 069/131] cifs: use find_get_pages_range_tag() wdata_alloc_and_fillpages() needlessly iterates calls to find_get_pages_tag(). Also it wants only pages from given range. Make it use find_get_pages_range_tag(). Link: http://lkml.kernel.org/r/20171009151359.31984-17-jack@suse.cz Signed-off-by: Jan Kara Suggested-by: Daniel Jordan Reviewed-by: Daniel Jordan Cc: Steve French Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/cifs/file.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 92fdf9c35de2..df9f682708c6 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1963,8 +1963,6 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, pgoff_t end, pgoff_t *index, unsigned int *found_pages) { - unsigned int nr_pages; - struct page **pages; struct cifs_writedata *wdata; wdata = cifs_writedata_alloc((unsigned int)tofind, @@ -1972,23 +1970,8 @@ wdata_alloc_and_fillpages(pgoff_t tofind, struct address_space *mapping, if (!wdata) return NULL; - /* - * find_get_pages_tag seems to return a max of 256 on each - * iteration, so we must call it several times in order to - * fill the array or the wsize is effectively limited to - * 256 * PAGE_SIZE. - */ - *found_pages = 0; - pages = wdata->pages; - do { - nr_pages = find_get_pages_tag(mapping, index, - PAGECACHE_TAG_DIRTY, tofind, - pages); - *found_pages += nr_pages; - tofind -= nr_pages; - pages += nr_pages; - } while (nr_pages && tofind && *index <= end); - + *found_pages = find_get_pages_range_tag(mapping, index, end, + PAGECACHE_TAG_DIRTY, tofind, wdata->pages); return wdata; } From 7d6c4dfa4de96d11b9d6adaf5aa5ca8c54670258 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 15 Nov 2017 17:35:30 -0800 Subject: [PATCH 070/131] kmemleak: change /sys/kernel/debug/kmemleak permissions from 0444 to 0644 Kmemleak can be tweaked at runtime by writing commands into debugfs file. Root can use it anyway, but without the write-bit this interface isn't obvious. Link: http://lkml.kernel.org/r/150728996582.744328.11541332857988399411.stgit@buzz Signed-off-by: Konstantin Khlebnikov Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 7780cd83a495..fca3452e56c1 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -2104,7 +2104,7 @@ static int __init kmemleak_late_init(void) return -ENOMEM; } - dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, + dentry = debugfs_create_file("kmemleak", 0644, NULL, NULL, &kmemleak_fops); if (!dentry) pr_warn("Failed to create the debugfs kmemleak file\n"); From b4e98d9ac775907cc53fb08fcb6776deb7694e30 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:33 -0800 Subject: [PATCH 071/131] mm: account pud page tables On a machine with 5-level paging support a process can allocate significant amount of memory and stay unnoticed by oom-killer and memory cgroup. The trick is to allocate a lot of PUD page tables. We don't account PUD page tables, only PMD and PTE. We already addressed the same issue for PMD page tables, see commit dc6c9a35b66b ("mm: account pmd page tables to the process"). Introduction of 5-level paging brings the same issue for PUD page tables. The patch expands accounting to PUD level. [kirill.shutemov@linux.intel.com: s/pmd_t/pud_t/] Link: http://lkml.kernel.org/r/20171004074305.x35eh5u7ybbt5kar@black.fi.intel.com [heiko.carstens@de.ibm.com: s390/mm: fix pud table accounting] Link: http://lkml.kernel.org/r/20171103090551.18231-1-heiko.carstens@de.ibm.com Link: http://lkml.kernel.org/r/20171002080427.3320-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Signed-off-by: Heiko Carstens Acked-by: Rik van Riel Acked-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 8 +++---- arch/powerpc/mm/hugetlbpage.c | 1 + arch/s390/include/asm/mmu_context.h | 4 +++- arch/sparc/mm/hugetlbpage.c | 1 + fs/proc/task_mmu.c | 5 +++- include/linux/mm.h | 36 ++++++++++++++++++++++++++--- include/linux/mm_types.h | 3 +++ kernel/fork.c | 4 ++++ mm/debug.c | 6 +++-- mm/memory.c | 15 +++++++----- mm/oom_kill.c | 8 ++++--- 11 files changed, 71 insertions(+), 20 deletions(-) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 30fd16b14196..2729e8db9492 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -629,10 +629,10 @@ oom_dump_tasks Enables a system-wide task dump (excluding kernel threads) to be produced when the kernel performs an OOM-killing and includes such information as -pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj -score, and name. This is helpful to determine why the OOM killer was -invoked, to identify the rogue task that caused it, and to determine why -the OOM killer chose the task it did to kill. +pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, nr_puds, swapents, +oom_score_adj score, and name. This is helpful to determine why the OOM +killer was invoked, to identify the rogue task that caused it, and to +determine why the OOM killer chose the task it did to kill. If this is set to zero, this information is suppressed. On very large systems with thousands of tasks it may not be feasible to dump diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index 1571a498a33f..a9b9083c5e49 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -433,6 +433,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); pud_free_tlb(tlb, pud, start); + mm_dec_nr_puds(tlb->mm); } /* diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h index 43607bb12cc2..cf4c1cb17dcd 100644 --- a/arch/s390/include/asm/mmu_context.h +++ b/arch/s390/include/asm/mmu_context.h @@ -44,6 +44,8 @@ static inline int init_new_context(struct task_struct *tsk, mm->context.asce_limit = STACK_TOP_MAX; mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_REGION3; + /* pgd_alloc() did not account this pud */ + mm_inc_nr_puds(mm); break; case -PAGE_SIZE: /* forked 5-level task, set new asce with new_mm->pgd */ @@ -59,7 +61,7 @@ static inline int init_new_context(struct task_struct *tsk, /* forked 2-level compat task, set new asce with new mm->pgd */ mm->context.asce = __pa(mm->pgd) | _ASCE_TABLE_LENGTH | _ASCE_USER_BITS | _ASCE_TYPE_SEGMENT; - /* pgd_alloc() did not increase mm->nr_pmds */ + /* pgd_alloc() did not account this pmd */ mm_inc_nr_pmds(mm); } crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm)); diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 5078b7f68890..01f63b4ee2b4 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -472,6 +472,7 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); pud_free_tlb(tlb, pud, start); + mm_dec_nr_puds(tlb->mm); } void hugetlb_free_pgd_range(struct mmu_gather *tlb, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6744bd706ecf..a05ce6186f99 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -26,7 +26,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long text, lib, swap, ptes, pmds, anon, file, shmem; + unsigned long text, lib, swap, ptes, pmds, puds, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; anon = get_mm_counter(mm, MM_ANONPAGES); @@ -52,6 +52,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) swap = get_mm_counter(mm, MM_SWAPENTS); ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); + puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -68,6 +69,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n" "VmPMD:\t%8lu kB\n" + "VmPUD:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), total_vm << (PAGE_SHIFT-10), @@ -82,6 +84,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) mm->stack_vm << (PAGE_SHIFT-10), text, lib, ptes >> 10, pmds >> 10, + puds >> 10, swap << (PAGE_SHIFT-10)); hugetlb_report_usage(m, mm); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 91b46f99b4d2..9af86f39d928 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1599,14 +1599,44 @@ static inline int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address); #endif -#ifdef __PAGETABLE_PUD_FOLDED +#if defined(__PAGETABLE_PUD_FOLDED) || !defined(CONFIG_MMU) static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) { return 0; } + +static inline unsigned long mm_nr_puds(const struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_nr_puds_init(struct mm_struct *mm) {} +static inline void mm_inc_nr_puds(struct mm_struct *mm) {} +static inline void mm_dec_nr_puds(struct mm_struct *mm) {} + #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); + +static inline void mm_nr_puds_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->nr_puds, 0); +} + +static inline unsigned long mm_nr_puds(const struct mm_struct *mm) +{ + return atomic_long_read(&mm->nr_puds); +} + +static inline void mm_inc_nr_puds(struct mm_struct *mm) +{ + atomic_long_inc(&mm->nr_puds); +} + +static inline void mm_dec_nr_puds(struct mm_struct *mm) +{ + atomic_long_dec(&mm->nr_puds); +} #endif #if defined(__PAGETABLE_PMD_FOLDED) || !defined(CONFIG_MMU) @@ -1618,7 +1648,7 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, static inline void mm_nr_pmds_init(struct mm_struct *mm) {} -static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) { return 0; } @@ -1634,7 +1664,7 @@ static inline void mm_nr_pmds_init(struct mm_struct *mm) atomic_long_set(&mm->nr_pmds, 0); } -static inline unsigned long mm_nr_pmds(struct mm_struct *mm) +static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) { return atomic_long_read(&mm->nr_pmds); } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d1b8e8f97fc2..e9e561e02d22 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -404,6 +404,9 @@ struct mm_struct { atomic_long_t nr_ptes; /* PTE page table pages */ #if CONFIG_PGTABLE_LEVELS > 2 atomic_long_t nr_pmds; /* PMD page table pages */ +#endif +#if CONFIG_PGTABLE_LEVELS > 3 + atomic_long_t nr_puds; /* PUD page table pages */ #endif int map_count; /* number of VMAs */ diff --git a/kernel/fork.c b/kernel/fork.c index 07cc743698d3..a4eb6f289365 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -819,6 +819,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->core_state = NULL; atomic_long_set(&mm->nr_ptes, 0); mm_nr_pmds_init(mm); + mm_nr_puds_init(mm); mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; @@ -878,6 +879,9 @@ static void check_mm(struct mm_struct *mm) if (mm_nr_pmds(mm)) pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", mm_nr_pmds(mm)); + if (mm_nr_puds(mm)) + pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n", + mm_nr_puds(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); diff --git a/mm/debug.c b/mm/debug.c index 6726bec731c9..a12d826bb774 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -105,7 +105,8 @@ void dump_mm(const struct mm_struct *mm) "get_unmapped_area %p\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n" + "pgd %p mm_users %d mm_count %d\n" + "nr_ptes %lu nr_pmds %lu nr_puds %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" @@ -136,7 +137,8 @@ void dump_mm(const struct mm_struct *mm) mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), atomic_long_read((atomic_long_t *)&mm->nr_ptes), - mm_nr_pmds((struct mm_struct *)mm), + mm_nr_pmds(mm), + mm_nr_puds(mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, diff --git a/mm/memory.c b/mm/memory.c index 42fb30300bb5..6bbd4078ec98 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -506,6 +506,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, pud = pud_offset(p4d, start); p4d_clear(p4d); pud_free_tlb(tlb, pud, start); + mm_dec_nr_puds(tlb->mm); } static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -4149,15 +4150,17 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address) spin_lock(&mm->page_table_lock); #ifndef __ARCH_HAS_5LEVEL_HACK - if (p4d_present(*p4d)) /* Another has populated it */ - pud_free(mm, new); - else + if (!p4d_present(*p4d)) { + mm_inc_nr_puds(mm); p4d_populate(mm, p4d, new); -#else - if (pgd_present(*p4d)) /* Another has populated it */ + } else /* Another has populated it */ pud_free(mm, new); - else +#else + if (!pgd_present(*p4d)) { + mm_inc_nr_puds(mm); pgd_populate(mm, p4d, new); + } else /* Another has populated it */ + pud_free(mm, new); #endif /* __ARCH_HAS_5LEVEL_HACK */ spin_unlock(&mm->page_table_lock); return 0; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3023919970f7..f642a45b7f14 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -221,7 +221,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + - atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm); + atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm) + + mm_nr_puds(p->mm); task_unlock(p); /* @@ -397,7 +398,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds nr_puds swapents oom_score_adj name\n"); rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) @@ -413,11 +414,12 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), atomic_long_read(&task->mm->nr_ptes), mm_nr_pmds(task->mm), + mm_nr_puds(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); From c4812909f5d5a9b7f1c85a2d95be388a066cda52 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:37 -0800 Subject: [PATCH 072/131] mm: introduce wrappers to access mm->nr_ptes Let's add wrappers for ->nr_ptes with the same interface as for nr_pmd and nr_pud. The patch also makes nr_ptes accounting dependent onto CONFIG_MMU. Page table accounting doesn't make sense if you don't have page tables. It's preparation for consolidation of page-table counters in mm_struct. Link: http://lkml.kernel.org/r/20171006100651.44742-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/pgd.c | 2 +- arch/sparc/mm/hugetlbpage.c | 2 +- arch/unicore32/mm/pgd.c | 2 +- fs/proc/task_mmu.c | 2 +- include/linux/mm.h | 32 ++++++++++++++++++++++++++++++++ include/linux/mm_types.h | 2 ++ kernel/fork.c | 6 +++--- mm/debug.c | 2 +- mm/huge_memory.c | 10 +++++----- mm/khugepaged.c | 2 +- mm/memory.c | 8 ++++---- mm/oom_kill.c | 5 ++--- 12 files changed, 54 insertions(+), 21 deletions(-) diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index c1c1a5c67da1..61e281cb29fb 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -141,7 +141,7 @@ void pgd_free(struct mm_struct *mm, pgd_t *pgd_base) pte = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free(mm, pte); - atomic_long_dec(&mm->nr_ptes); + mm_dec_nr_ptes(mm); no_pmd: pud_clear(pud); pmd_free(mm, pmd); diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 01f63b4ee2b4..0112d6942288 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -397,7 +397,7 @@ static void hugetlb_free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pmd_clear(pmd); pte_free_tlb(tlb, token, addr); - atomic_long_dec(&tlb->mm->nr_ptes); + mm_dec_nr_ptes(tlb->mm); } static void hugetlb_free_pmd_range(struct mmu_gather *tlb, pud_t *pud, diff --git a/arch/unicore32/mm/pgd.c b/arch/unicore32/mm/pgd.c index c572a28c76c9..a830a300aaa1 100644 --- a/arch/unicore32/mm/pgd.c +++ b/arch/unicore32/mm/pgd.c @@ -97,7 +97,7 @@ void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd) pte = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free(mm, pte); - atomic_long_dec(&mm->nr_ptes); + mm_dec_nr_ptes(mm); pmd_free(mm, pmd); mm_dec_nr_pmds(mm); free: diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a05ce6186f99..9bd2a0294ac1 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -50,7 +50,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); - ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); + ptes = PTRS_PER_PTE * sizeof(pte_t) * mm_nr_ptes(mm); pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm); seq_printf(m, diff --git a/include/linux/mm.h b/include/linux/mm.h index 9af86f39d928..2ca799f0d762 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1680,6 +1680,38 @@ static inline void mm_dec_nr_pmds(struct mm_struct *mm) } #endif +#ifdef CONFIG_MMU +static inline void mm_nr_ptes_init(struct mm_struct *mm) +{ + atomic_long_set(&mm->nr_ptes, 0); +} + +static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +{ + return atomic_long_read(&mm->nr_ptes); +} + +static inline void mm_inc_nr_ptes(struct mm_struct *mm) +{ + atomic_long_inc(&mm->nr_ptes); +} + +static inline void mm_dec_nr_ptes(struct mm_struct *mm) +{ + atomic_long_dec(&mm->nr_ptes); +} +#else +static inline void mm_nr_ptes_init(struct mm_struct *mm) {} + +static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +{ + return 0; +} + +static inline void mm_inc_nr_ptes(struct mm_struct *mm) {} +static inline void mm_dec_nr_ptes(struct mm_struct *mm) {} +#endif + int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e9e561e02d22..e42048020664 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -401,7 +401,9 @@ struct mm_struct { */ atomic_t mm_count; +#ifdef CONFIG_MMU atomic_long_t nr_ptes; /* PTE page table pages */ +#endif #if CONFIG_PGTABLE_LEVELS > 2 atomic_long_t nr_pmds; /* PMD page table pages */ #endif diff --git a/kernel/fork.c b/kernel/fork.c index a4eb6f289365..946922a30ede 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -817,7 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; - atomic_long_set(&mm->nr_ptes, 0); + mm_nr_ptes_init(mm); mm_nr_pmds_init(mm); mm_nr_puds_init(mm); mm->map_count = 0; @@ -873,9 +873,9 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } - if (atomic_long_read(&mm->nr_ptes)) + if (mm_nr_ptes(mm)) pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", - atomic_long_read(&mm->nr_ptes)); + mm_nr_ptes(mm)); if (mm_nr_pmds(mm)) pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", mm_nr_pmds(mm)); diff --git a/mm/debug.c b/mm/debug.c index a12d826bb774..c9888a6d7875 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -136,7 +136,7 @@ void dump_mm(const struct mm_struct *mm) mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), - atomic_long_read((atomic_long_t *)&mm->nr_ptes), + mm_nr_ptes(mm), mm_nr_pmds(mm), mm_nr_puds(mm), mm->map_count, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cc65fb87c9db..3610d81c062a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -606,7 +606,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); } @@ -662,7 +662,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, if (pgtable) pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); return true; } @@ -747,7 +747,7 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, if (pgtable) { pgtable_trans_huge_deposit(mm, pmd, pgtable); - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); } set_pmd_at(mm, addr, pmd, entry); @@ -978,7 +978,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, get_page(src_page); page_dup_rmap(src_page, true); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&dst_mm->nr_ptes); + mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); pmdp_set_wrprotect(src_mm, addr, src_pmd); @@ -1695,7 +1695,7 @@ static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) pgtable = pgtable_trans_huge_withdraw(mm, pmd); pte_free(mm, pgtable); - atomic_long_dec(&mm->nr_ptes); + mm_dec_nr_ptes(mm); } int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 43cb3043311b..ea4ff259b671 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1270,7 +1270,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) _pmd = pmdp_collapse_flush(vma, addr, pmd); spin_unlock(ptl); up_write(&vma->vm_mm->mmap_sem); - atomic_long_dec(&vma->vm_mm->nr_ptes); + mm_dec_nr_ptes(vma->vm_mm); pte_free(vma->vm_mm, pmd_pgtable(_pmd)); } } diff --git a/mm/memory.c b/mm/memory.c index 6bbd4078ec98..6dec21b182b0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -438,7 +438,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); - atomic_long_dec(&tlb->mm->nr_ptes); + mm_dec_nr_ptes(tlb->mm); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -666,7 +666,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) ptl = pmd_lock(mm, pmd); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ - atomic_long_inc(&mm->nr_ptes); + mm_inc_nr_ptes(mm); pmd_populate(mm, pmd, new); new = NULL; } @@ -3238,7 +3238,7 @@ static int pte_alloc_one_map(struct vm_fault *vmf) goto map_pte; } - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); spin_unlock(vmf->ptl); vmf->prealloc_pte = NULL; @@ -3297,7 +3297,7 @@ static void deposit_prealloc_pte(struct vm_fault *vmf) * We are going to consume the prealloc table, * count that as nr_ptes. */ - atomic_long_inc(&vma->vm_mm->nr_ptes); + mm_inc_nr_ptes(vma->vm_mm); vmf->prealloc_pte = NULL; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f642a45b7f14..f9300141480e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -221,8 +221,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + - atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm) + - mm_nr_puds(p->mm); + mm_nr_ptes(p->mm) + mm_nr_pmds(p->mm) + mm_nr_puds(p->mm); task_unlock(p); /* @@ -417,7 +416,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - atomic_long_read(&task->mm->nr_ptes), + mm_nr_ptes(task->mm), mm_nr_pmds(task->mm), mm_nr_puds(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), From af5b0f6a09e42c9f4fa87735f2a366748767b686 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 15 Nov 2017 17:35:40 -0800 Subject: [PATCH 073/131] mm: consolidate page table accounting Currently, we account page tables separately for each page table level, but that's redundant -- we only make use of total memory allocated to page tables for oom_badness calculation. We also provide the information to userspace, but it has dubious value there too. This patch switches page table accounting to single counter. mm->pgtables_bytes is now used to account all page table levels. We use bytes, because page table size for different levels of page table tree may be different. The change has user-visible effect: we don't have VmPMD and VmPUD reported in /proc/[pid]/status. Not sure if anybody uses them. (As alternative, we can always report 0 kB for them.) OOM-killer report is also slightly changed: we now report pgtables_bytes instead of nr_ptes, nr_pmd, nr_puds. Apart from reducing number of counters per-mm, the benefit is that we now calculate oom_badness() more correctly for machines which have different size of page tables depending on level or where page tables are less than a page in size. The only downside can be debuggability because we do not know which page table level could leak. But I do not remember many bugs that would be caught by separate counters so I wouldn't lose sleep over this. [akpm@linux-foundation.org: fix mm/huge_memory.c] Link: http://lkml.kernel.org/r/20171006100651.44742-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko [kirill.shutemov@linux.intel.com: fix build] Link: http://lkml.kernel.org/r/20171016150113.ikfxy3e7zzfvsr4w@black.fi.intel.com Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 1 - Documentation/sysctl/vm.txt | 8 ++--- fs/proc/task_mmu.c | 11 ++---- include/linux/mm.h | 58 +++++++----------------------- include/linux/mm_types.h | 8 +---- kernel/fork.c | 16 +++------ mm/debug.c | 7 ++-- mm/huge_memory.c | 2 +- mm/oom_kill.c | 14 ++++---- 9 files changed, 32 insertions(+), 93 deletions(-) diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index adba21b5ada7..ec571b9bb18a 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -250,7 +250,6 @@ Table 1-2: Contents of the status files (as of 4.8) VmExe size of text segment VmLib size of shared library code VmPTE size of page table entries - VmPMD size of second level page tables VmSwap amount of swap used by anonymous private data (shmem swap usage is not included) HugetlbPages size of hugetlb memory portions diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 2729e8db9492..3e579740b49f 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -629,10 +629,10 @@ oom_dump_tasks Enables a system-wide task dump (excluding kernel threads) to be produced when the kernel performs an OOM-killing and includes such information as -pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, nr_puds, swapents, -oom_score_adj score, and name. This is helpful to determine why the OOM -killer was invoked, to identify the rogue task that caused it, and to -determine why the OOM killer chose the task it did to kill. +pid, uid, tgid, vm size, rss, pgtables_bytes, swapents, oom_score_adj +score, and name. This is helpful to determine why the OOM killer was +invoked, to identify the rogue task that caused it, and to determine why +the OOM killer chose the task it did to kill. If this is set to zero, this information is suppressed. On very large systems with thousands of tasks it may not be feasible to dump diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9bd2a0294ac1..875231c36cb3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -26,7 +26,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long text, lib, swap, ptes, pmds, puds, anon, file, shmem; + unsigned long text, lib, swap, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; anon = get_mm_counter(mm, MM_ANONPAGES); @@ -50,9 +50,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); - ptes = PTRS_PER_PTE * sizeof(pte_t) * mm_nr_ptes(mm); - pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); - puds = PTRS_PER_PUD * sizeof(pud_t) * mm_nr_puds(mm); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -68,8 +65,6 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n" - "VmPMD:\t%8lu kB\n" - "VmPUD:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), total_vm << (PAGE_SHIFT-10), @@ -82,9 +77,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) shmem << (PAGE_SHIFT-10), mm->data_vm << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - ptes >> 10, - pmds >> 10, - puds >> 10, + mm_pgtables_bytes(mm) >> 10, swap << (PAGE_SHIFT-10)); hugetlb_report_usage(m, mm); } diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ca799f0d762..7c1e82a1aa77 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1605,37 +1605,20 @@ static inline int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, { return 0; } - -static inline unsigned long mm_nr_puds(const struct mm_struct *mm) -{ - return 0; -} - -static inline void mm_nr_puds_init(struct mm_struct *mm) {} static inline void mm_inc_nr_puds(struct mm_struct *mm) {} static inline void mm_dec_nr_puds(struct mm_struct *mm) {} #else int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address); -static inline void mm_nr_puds_init(struct mm_struct *mm) -{ - atomic_long_set(&mm->nr_puds, 0); -} - -static inline unsigned long mm_nr_puds(const struct mm_struct *mm) -{ - return atomic_long_read(&mm->nr_puds); -} - static inline void mm_inc_nr_puds(struct mm_struct *mm) { - atomic_long_inc(&mm->nr_puds); + atomic_long_add(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_puds(struct mm_struct *mm) { - atomic_long_dec(&mm->nr_puds); + atomic_long_sub(PTRS_PER_PUD * sizeof(pud_t), &mm->pgtables_bytes); } #endif @@ -1646,64 +1629,47 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, return 0; } -static inline void mm_nr_pmds_init(struct mm_struct *mm) {} - -static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) -{ - return 0; -} - static inline void mm_inc_nr_pmds(struct mm_struct *mm) {} static inline void mm_dec_nr_pmds(struct mm_struct *mm) {} #else int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); -static inline void mm_nr_pmds_init(struct mm_struct *mm) -{ - atomic_long_set(&mm->nr_pmds, 0); -} - -static inline unsigned long mm_nr_pmds(const struct mm_struct *mm) -{ - return atomic_long_read(&mm->nr_pmds); -} - static inline void mm_inc_nr_pmds(struct mm_struct *mm) { - atomic_long_inc(&mm->nr_pmds); + atomic_long_add(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_pmds(struct mm_struct *mm) { - atomic_long_dec(&mm->nr_pmds); + atomic_long_sub(PTRS_PER_PMD * sizeof(pmd_t), &mm->pgtables_bytes); } #endif #ifdef CONFIG_MMU -static inline void mm_nr_ptes_init(struct mm_struct *mm) +static inline void mm_pgtables_bytes_init(struct mm_struct *mm) { - atomic_long_set(&mm->nr_ptes, 0); + atomic_long_set(&mm->pgtables_bytes, 0); } -static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { - return atomic_long_read(&mm->nr_ptes); + return atomic_long_read(&mm->pgtables_bytes); } static inline void mm_inc_nr_ptes(struct mm_struct *mm) { - atomic_long_inc(&mm->nr_ptes); + atomic_long_add(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } static inline void mm_dec_nr_ptes(struct mm_struct *mm) { - atomic_long_dec(&mm->nr_ptes); + atomic_long_sub(PTRS_PER_PTE * sizeof(pte_t), &mm->pgtables_bytes); } #else -static inline void mm_nr_ptes_init(struct mm_struct *mm) {} -static inline unsigned long mm_nr_ptes(const struct mm_struct *mm) +static inline void mm_pgtables_bytes_init(struct mm_struct *mm) {} +static inline unsigned long mm_pgtables_bytes(const struct mm_struct *mm) { return 0; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e42048020664..09643e0472fc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -402,13 +402,7 @@ struct mm_struct { atomic_t mm_count; #ifdef CONFIG_MMU - atomic_long_t nr_ptes; /* PTE page table pages */ -#endif -#if CONFIG_PGTABLE_LEVELS > 2 - atomic_long_t nr_pmds; /* PMD page table pages */ -#endif -#if CONFIG_PGTABLE_LEVELS > 3 - atomic_long_t nr_puds; /* PUD page table pages */ + atomic_long_t pgtables_bytes; /* PTE page table pages */ #endif int map_count; /* number of VMAs */ diff --git a/kernel/fork.c b/kernel/fork.c index 946922a30ede..006dc5899a1a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -817,9 +817,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->core_state = NULL; - mm_nr_ptes_init(mm); - mm_nr_pmds_init(mm); - mm_nr_puds_init(mm); + mm_pgtables_bytes_init(mm); mm->map_count = 0; mm->locked_vm = 0; mm->pinned_vm = 0; @@ -873,15 +871,9 @@ static void check_mm(struct mm_struct *mm) "mm:%p idx:%d val:%ld\n", mm, i, x); } - if (mm_nr_ptes(mm)) - pr_alert("BUG: non-zero nr_ptes on freeing mm: %ld\n", - mm_nr_ptes(mm)); - if (mm_nr_pmds(mm)) - pr_alert("BUG: non-zero nr_pmds on freeing mm: %ld\n", - mm_nr_pmds(mm)); - if (mm_nr_puds(mm)) - pr_alert("BUG: non-zero nr_puds on freeing mm: %ld\n", - mm_nr_puds(mm)); + if (mm_pgtables_bytes(mm)) + pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", + mm_pgtables_bytes(mm)); #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS VM_BUG_ON_MM(mm->pmd_huge_pte, mm); diff --git a/mm/debug.c b/mm/debug.c index c9888a6d7875..d947f3e03b0d 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -105,8 +105,7 @@ void dump_mm(const struct mm_struct *mm) "get_unmapped_area %p\n" #endif "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" - "pgd %p mm_users %d mm_count %d\n" - "nr_ptes %lu nr_pmds %lu nr_puds %lu map_count %d\n" + "pgd %p mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %lx data_vm %lx exec_vm %lx stack_vm %lx\n" "start_code %lx end_code %lx start_data %lx end_data %lx\n" @@ -136,9 +135,7 @@ void dump_mm(const struct mm_struct *mm) mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), - mm_nr_ptes(mm), - mm_nr_pmds(mm), - mm_nr_puds(mm), + mm_pgtables_bytes(mm), mm->map_count, mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm, mm->pinned_vm, mm->data_vm, mm->exec_vm, mm->stack_vm, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3610d81c062a..86fe697e8bfb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -942,7 +942,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, set_pmd_at(src_mm, addr, src_pmd, pmd); } add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&dst_mm->nr_ptes); + mm_inc_nr_ptes(dst_mm); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); ret = 0; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f9300141480e..26add8a0d1f7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -221,7 +221,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * task's rss, pagetable and swap space use. */ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) + - mm_nr_ptes(p->mm) + mm_nr_pmds(p->mm) + mm_nr_puds(p->mm); + mm_pgtables_bytes(p->mm) / PAGE_SIZE; task_unlock(p); /* @@ -389,15 +389,15 @@ static void select_bad_process(struct oom_control *oc) * Dumps the current memory state of all eligible tasks. Tasks not in the same * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes * are not shown. - * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, - * swapents, oom_score_adj value, and name. + * State information includes task's pid, uid, tgid, vm size, rss, + * pgtables_bytes, swapents, oom_score_adj value, and name. */ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) { struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds nr_puds swapents oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) @@ -413,12 +413,10 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %7ld %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - mm_nr_ptes(task->mm), - mm_nr_pmds(task->mm), - mm_nr_puds(task->mm), + mm_pgtables_bytes(task->mm), get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); From f3f7c09355784b02e49853dd1403d0e2d4f991c6 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 15 Nov 2017 17:35:44 -0800 Subject: [PATCH 074/131] fs, mm: account filp cache to kmemcg The allocations from filp cache can be directly triggered by userspace applications. A buggy application can consume a significant amount of unaccounted system memory. Though we have not noticed such buggy applications in our production but upon close inspection, we found that a lot of machines spend very significant amount of memory on these caches. One way to limit allocations from filp cache is to set system level limit of maximum number of open files. However this limit is shared between different users on the system and one user can hog this resource. To cater that, we can charge filp to kmemcg and set the maximum limit very high and let the memory limit of each user limit the number of files they can open and indirectly limiting their allocations from filp cache. One side effect of this change is that it will allow _sysctl() to return ENOMEM and the man page of _sysctl() does not specify that. However the man page also discourages to use _sysctl() at all. Link: http://lkml.kernel.org/r/20171011190359.34926-1-shakeelb@google.com Signed-off-by: Shakeel Butt Cc: Alexander Viro Cc: Vladimir Davydov Cc: Michal Hocko Cc: Greg Thelen Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/file_table.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/file_table.c b/fs/file_table.c index 49e1f2f1a4cb..2dc9f38bd195 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -312,7 +312,7 @@ void put_filp(struct file *file) void __init files_init(void) { filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, - SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL); percpu_counter_init(&nr_files, 0, GFP_KERNEL); } From cdb07bdea28ebf1286a979501620745680596365 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 15 Nov 2017 17:35:47 -0800 Subject: [PATCH 075/131] mm/rmap.c: remove redundant variable cend Variable cend is set but never read, hence it is redundant and can be removed. Cleans up clang build warning: Value stored to 'cend' is never read Link: http://lkml.kernel.org/r/20171011174942.1372-1-colin.king@canonical.com Fixes: 369ea8242c0f ("mm/rmap: update to new mmu_notifier semantic v2") Signed-off-by: Colin Ian King Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 7dfc0975de4b..6b5a0f219ac0 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -899,7 +899,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(vma->vm_mm, start, end); while (page_vma_mapped_walk(&pvmw)) { - unsigned long cstart, cend; + unsigned long cstart; int ret = 0; cstart = address = pvmw.address; @@ -915,7 +915,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, entry = pte_wrprotect(entry); entry = pte_mkclean(entry); set_pte_at(vma->vm_mm, address, pte, entry); - cend = cstart + PAGE_SIZE; ret = 1; } else { #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE @@ -931,7 +930,6 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, entry = pmd_mkclean(entry); set_pmd_at(vma->vm_mm, address, pmd, entry); cstart &= PMD_MASK; - cend = cstart + PMD_SIZE; ret = 1; #else /* unexpected pmd-mapped page? */ From 4950276672fce5c241857540f8561c440663673d Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:35:51 -0800 Subject: [PATCH 076/131] kmemcheck: remove annotations Patch series "kmemcheck: kill kmemcheck", v2. As discussed at LSF/MM, kill kmemcheck. KASan is a replacement that is able to work without the limitation of kmemcheck (single CPU, slow). KASan is already upstream. We are also not aware of any users of kmemcheck (or users who don't consider KASan as a suitable replacement). The only objection was that since KASAN wasn't supported by all GCC versions provided by distros at that time we should hold off for 2 years, and try again. Now that 2 years have passed, and all distros provide gcc that supports KASAN, kill kmemcheck again for the very same reasons. This patch (of 4): Remove kmemcheck annotations, and calls to kmemcheck from the kernel. [alexander.levin@verizon.com: correctly remove kmemcheck call from dma_map_sg_attrs] Link: http://lkml.kernel.org/r/20171012192151.26531-1-alexander.levin@verizon.com Link: http://lkml.kernel.org/r/20171007030159.22241-2-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Alexander Potapenko Cc: Eric W. Biederman Cc: Michal Hocko Cc: Pekka Enberg Cc: Steven Rostedt Cc: Tim Hansen Cc: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/dma-iommu.h | 1 - arch/openrisc/include/asm/dma-mapping.h | 1 - arch/x86/Makefile | 5 ----- arch/x86/include/asm/dma-mapping.h | 1 - arch/x86/include/asm/xor.h | 5 +---- arch/x86/kernel/traps.c | 5 ----- arch/x86/mm/fault.c | 6 ------ drivers/char/random.c | 1 - drivers/misc/c2port/core.c | 2 -- fs/dcache.c | 2 -- include/linux/c2port.h | 4 ---- include/linux/dma-mapping.h | 8 +------- include/linux/filter.h | 2 -- include/linux/mm_types.h | 8 -------- include/linux/net.h | 3 --- include/linux/ring_buffer.h | 3 --- include/linux/skbuff.h | 3 --- include/net/inet_sock.h | 3 --- include/net/inet_timewait_sock.h | 4 ---- include/net/sock.h | 3 --- init/main.c | 1 - kernel/bpf/core.c | 6 ------ kernel/locking/lockdep.c | 3 --- kernel/trace/ring_buffer.c | 3 --- mm/kmemleak.c | 9 --------- mm/page_alloc.c | 14 -------------- mm/slab.c | 14 -------------- mm/slab.h | 2 -- mm/slub.c | 20 -------------------- net/core/skbuff.c | 5 ----- net/core/sock.c | 2 -- net/ipv4/inet_timewait_sock.c | 3 --- net/ipv4/tcp_input.c | 1 - net/socket.c | 1 - 34 files changed, 2 insertions(+), 152 deletions(-) diff --git a/arch/arm/include/asm/dma-iommu.h b/arch/arm/include/asm/dma-iommu.h index 0722ec6be692..6821f1249300 100644 --- a/arch/arm/include/asm/dma-iommu.h +++ b/arch/arm/include/asm/dma-iommu.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #define ARM_MAPPING_ERROR (~(dma_addr_t)0x0) diff --git a/arch/openrisc/include/asm/dma-mapping.h b/arch/openrisc/include/asm/dma-mapping.h index f41bd3cb76d9..e212a1f0b6d2 100644 --- a/arch/openrisc/include/asm/dma-mapping.h +++ b/arch/openrisc/include/asm/dma-mapping.h @@ -23,7 +23,6 @@ */ #include -#include #include extern const struct dma_map_ops or1k_dma_map_ops; diff --git a/arch/x86/Makefile b/arch/x86/Makefile index a20eacd9c7e9..3e73bc255e4e 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -158,11 +158,6 @@ ifdef CONFIG_X86_X32 endif export CONFIG_X86_X32_ABI -# Don't unroll struct assignments with kmemcheck enabled -ifeq ($(CONFIG_KMEMCHECK),y) - KBUILD_CFLAGS += $(call cc-option,-fno-builtin-memcpy) -endif - # # If the function graph tracer is used with mcount instead of fentry, # '-maccumulate-outgoing-args' is needed to prevent a GCC bug diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h index 43cbe843de8d..0350d99bb8fd 100644 --- a/arch/x86/include/asm/dma-mapping.h +++ b/arch/x86/include/asm/dma-mapping.h @@ -7,7 +7,6 @@ * Documentation/DMA-API.txt for documentation. */ -#include #include #include #include diff --git a/arch/x86/include/asm/xor.h b/arch/x86/include/asm/xor.h index 1f5c5161ead6..45c8605467f1 100644 --- a/arch/x86/include/asm/xor.h +++ b/arch/x86/include/asm/xor.h @@ -1,7 +1,4 @@ -#ifdef CONFIG_KMEMCHECK -/* kmemcheck doesn't handle MMX/SSE/SSE2 instructions */ -# include -#elif !defined(_ASM_X86_XOR_H) +#ifndef _ASM_X86_XOR_H #define _ASM_X86_XOR_H /* diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b7b0f74a2150..989514c94a55 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -42,7 +42,6 @@ #include #endif -#include #include #include #include @@ -749,10 +748,6 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) if (!dr6 && user_mode(regs)) user_icebp = 1; - /* Catch kmemcheck conditions! */ - if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) - goto exit; - /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 3109ba6c6ede..78ca9a8ee454 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -20,7 +20,6 @@ #include /* boot_cpu_has, ... */ #include /* dotraplinkage, ... */ #include /* pgd_*(), ... */ -#include /* kmemcheck_*(), ... */ #include /* VSYSCALL_ADDR */ #include /* emulate_vsyscall */ #include /* struct vm86 */ @@ -1256,8 +1255,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, * Detect and handle instructions that would cause a page fault for * both a tracked kernel page and a userspace page. */ - if (kmemcheck_active(regs)) - kmemcheck_hide(regs); prefetchw(&mm->mmap_sem); if (unlikely(kmmio_fault(regs, address))) @@ -1280,9 +1277,6 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code, if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) { if (vmalloc_fault(address) >= 0) return; - - if (kmemcheck_fault(regs, address, error_code)) - return; } /* Can handle a stale RO->RW TLB: */ diff --git a/drivers/char/random.c b/drivers/char/random.c index 6c7ccac2679e..ec42c8bb9b0d 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -259,7 +259,6 @@ #include #include #include -#include #include #include #include diff --git a/drivers/misc/c2port/core.c b/drivers/misc/c2port/core.c index 1922cb8f6b88..1c5b7aec13d4 100644 --- a/drivers/misc/c2port/core.c +++ b/drivers/misc/c2port/core.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include @@ -904,7 +903,6 @@ struct c2port_device *c2port_device_register(char *name, return ERR_PTR(-EINVAL); c2dev = kmalloc(sizeof(struct c2port_device), GFP_KERNEL); - kmemcheck_annotate_bitfield(c2dev, flags); if (unlikely(!c2dev)) return ERR_PTR(-ENOMEM); diff --git a/fs/dcache.c b/fs/dcache.c index bcc9f6981569..5c7df1df81ff 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2705,8 +2705,6 @@ static void swap_names(struct dentry *dentry, struct dentry *target) */ unsigned int i; BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); - kmemcheck_mark_initialized(dentry->d_iname, DNAME_INLINE_LEN); - kmemcheck_mark_initialized(target->d_iname, DNAME_INLINE_LEN); for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { swap(((long *) &dentry->d_iname)[i], ((long *) &target->d_iname)[i]); diff --git a/include/linux/c2port.h b/include/linux/c2port.h index 4efabcb51347..f2736348ca26 100644 --- a/include/linux/c2port.h +++ b/include/linux/c2port.h @@ -9,8 +9,6 @@ * the Free Software Foundation */ -#include - #define C2PORT_NAME_LEN 32 struct device; @@ -22,10 +20,8 @@ struct device; /* Main struct */ struct c2port_ops; struct c2port_device { - kmemcheck_bitfield_begin(flags); unsigned int access:1; unsigned int flash_access:1; - kmemcheck_bitfield_end(flags); int id; char name[C2PORT_NAME_LEN]; diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index eee1499db396..e8f8e8fb244d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -9,7 +9,6 @@ #include #include #include -#include #include #include @@ -232,7 +231,6 @@ static inline dma_addr_t dma_map_single_attrs(struct device *dev, void *ptr, const struct dma_map_ops *ops = get_dma_ops(dev); dma_addr_t addr; - kmemcheck_mark_initialized(ptr, size); BUG_ON(!valid_dma_direction(dir)); addr = ops->map_page(dev, virt_to_page(ptr), offset_in_page(ptr), size, @@ -265,11 +263,8 @@ static inline int dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, unsigned long attrs) { const struct dma_map_ops *ops = get_dma_ops(dev); - int i, ents; - struct scatterlist *s; + int ents; - for_each_sg(sg, s, nents, i) - kmemcheck_mark_initialized(sg_virt(s), s->length); BUG_ON(!valid_dma_direction(dir)); ents = ops->map_sg(dev, sg, nents, dir, attrs); BUG_ON(ents < 0); @@ -299,7 +294,6 @@ static inline dma_addr_t dma_map_page_attrs(struct device *dev, const struct dma_map_ops *ops = get_dma_ops(dev); dma_addr_t addr; - kmemcheck_mark_initialized(page_address(page) + offset, size); BUG_ON(!valid_dma_direction(dir)); addr = ops->map_page(dev, page, offset, size, dir, attrs); debug_dma_map_page(dev, page, offset, size, dir, addr, false); diff --git a/include/linux/filter.h b/include/linux/filter.h index 48ec57e70f9f..42197b16dd78 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -454,13 +454,11 @@ struct bpf_binary_header { struct bpf_prog { u16 pages; /* Number of allocated pages */ - kmemcheck_bitfield_begin(meta); u16 jited:1, /* Is our filter JIT'ed? */ locked:1, /* Program image locked? */ gpl_compatible:1, /* Is filter GPL compatible? */ cb_access:1, /* Is control block accessed? */ dst_needed:1; /* Do we need dst entry? */ - kmemcheck_bitfield_end(meta); enum bpf_prog_type type; /* Type of BPF program */ u32 len; /* Number of filter blocks */ u32 jited_len; /* Size of jited insns in bytes */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 09643e0472fc..cfd0ac4e5e0e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -209,14 +209,6 @@ struct page { not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ -#ifdef CONFIG_KMEMCHECK - /* - * kmemcheck wants to track the status of each byte in a page; this - * is a pointer to such a status block. NULL if not tracked. - */ - void *shadow; -#endif - #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif diff --git a/include/linux/net.h b/include/linux/net.h index d97d80d7fdf8..caeb159abda5 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -22,7 +22,6 @@ #include #include #include /* For O_CLOEXEC and O_NONBLOCK */ -#include #include #include #include @@ -111,9 +110,7 @@ struct socket_wq { struct socket { socket_state state; - kmemcheck_bitfield_begin(type); short type; - kmemcheck_bitfield_end(type); unsigned long flags; diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index fa6ace66fea5..289e4d54e3e0 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -2,7 +2,6 @@ #ifndef _LINUX_RING_BUFFER_H #define _LINUX_RING_BUFFER_H -#include #include #include #include @@ -14,9 +13,7 @@ struct ring_buffer_iter; * Don't refer to this struct directly, use functions below. */ struct ring_buffer_event { - kmemcheck_bitfield_begin(bitfield); u32 type_len:5, time_delta:27; - kmemcheck_bitfield_end(bitfield); u32 array[]; }; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d448a4804aea..aa1341474916 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -15,7 +15,6 @@ #define _LINUX_SKBUFF_H #include -#include #include #include #include @@ -704,7 +703,6 @@ struct sk_buff { /* Following fields are _not_ copied in __copy_skb_header() * Note that queue_mapping is here mostly to fill a hole. */ - kmemcheck_bitfield_begin(flags1); __u16 queue_mapping; /* if you move cloned around you also must adapt those constants */ @@ -723,7 +721,6 @@ struct sk_buff { head_frag:1, xmit_more:1, __unused:1; /* one bit hole */ - kmemcheck_bitfield_end(flags1); /* fields enclosed in headers_start/headers_end are copied * using a single memcpy() in __copy_skb_header() diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h index db8162dd8c0b..8e51b4a69088 100644 --- a/include/net/inet_sock.h +++ b/include/net/inet_sock.h @@ -17,7 +17,6 @@ #define _INET_SOCK_H #include -#include #include #include #include @@ -84,7 +83,6 @@ struct inet_request_sock { #define ireq_state req.__req_common.skc_state #define ireq_family req.__req_common.skc_family - kmemcheck_bitfield_begin(flags); u16 snd_wscale : 4, rcv_wscale : 4, tstamp_ok : 1, @@ -93,7 +91,6 @@ struct inet_request_sock { ecn_ok : 1, acked : 1, no_srccheck: 1; - kmemcheck_bitfield_end(flags); u32 ir_mark; union { struct ip_options_rcu __rcu *ireq_opt; diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 6a75d67a30fd..1356fa6a7566 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -15,8 +15,6 @@ #ifndef _INET_TIMEWAIT_SOCK_ #define _INET_TIMEWAIT_SOCK_ - -#include #include #include #include @@ -69,14 +67,12 @@ struct inet_timewait_sock { /* Socket demultiplex comparisons on incoming packets. */ /* these three are in inet_sock */ __be16 tw_sport; - kmemcheck_bitfield_begin(flags); /* And these are ours. */ unsigned int tw_kill : 1, tw_transparent : 1, tw_flowlabel : 20, tw_pad : 2, /* 2 bits hole */ tw_tos : 8; - kmemcheck_bitfield_end(flags); struct timer_list tw_timer; struct inet_bind_bucket *tw_tb; }; diff --git a/include/net/sock.h b/include/net/sock.h index c577286dbffb..a63e6a8bb7e0 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -436,7 +436,6 @@ struct sock { #define SK_FL_TYPE_MASK 0xffff0000 #endif - kmemcheck_bitfield_begin(flags); unsigned int sk_padding : 1, sk_kern_sock : 1, sk_no_check_tx : 1, @@ -445,8 +444,6 @@ struct sock { sk_protocol : 8, sk_type : 16; #define SK_PROTOCOL_MAX U8_MAX - kmemcheck_bitfield_end(flags); - u16 sk_gso_max_segs; unsigned long sk_lingertime; struct proto *sk_prot_creator; diff --git a/init/main.c b/init/main.c index 3bdd8da90f69..859a786f7c0a 100644 --- a/init/main.c +++ b/init/main.c @@ -70,7 +70,6 @@ #include #include #include -#include #include #include #include diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 7b62df86be1d..11ad089f2c74 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -85,8 +85,6 @@ struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags) if (fp == NULL) return NULL; - kmemcheck_annotate_bitfield(fp, meta); - aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags); if (aux == NULL) { vfree(fp); @@ -127,8 +125,6 @@ struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size, if (fp == NULL) { __bpf_prog_uncharge(fp_old->aux->user, delta); } else { - kmemcheck_annotate_bitfield(fp, meta); - memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE); fp->pages = pages; fp->aux->prog = fp; @@ -662,8 +658,6 @@ static struct bpf_prog *bpf_prog_clone_create(struct bpf_prog *fp_other, fp = __vmalloc(fp_other->pages * PAGE_SIZE, gfp_flags, PAGE_KERNEL); if (fp != NULL) { - kmemcheck_annotate_bitfield(fp, meta); - /* aux->prog still points to the fp_other one, so * when promoting the clone to the real program, * this still needs to be adapted. diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index db933d063bfc..9776da8db180 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include @@ -3238,8 +3237,6 @@ static void __lockdep_init_map(struct lockdep_map *lock, const char *name, { int i; - kmemcheck_mark_initialized(lock, sizeof(*lock)); - for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) lock->class_cache[i] = NULL; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 845f3805c73d..d57fede84b38 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -13,7 +13,6 @@ #include #include #include /* for self test */ -#include #include #include #include @@ -2055,7 +2054,6 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, } event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); /* account for padding bytes */ local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes); @@ -2686,7 +2684,6 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* We reserved something on the buffer */ event = __rb_page_index(tail_page, tail); - kmemcheck_annotate_bitfield(event, bitfield); rb_update_event(cpu_buffer, event, info); local_inc(&tail_page->entries); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index fca3452e56c1..e4738d5e9b8c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -110,7 +110,6 @@ #include #include -#include #include #include @@ -1238,9 +1237,6 @@ static bool update_checksum(struct kmemleak_object *object) { u32 old_csum = object->checksum; - if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) - return false; - kasan_disable_current(); object->checksum = crc32(0, (void *)object->pointer, object->size); kasan_enable_current(); @@ -1314,11 +1310,6 @@ static void scan_block(void *_start, void *_end, if (scan_should_stop()) break; - /* don't scan uninitialized memory */ - if (!kmemcheck_is_obj_initialized((unsigned long)ptr, - BYTES_PER_POINTER)) - continue; - kasan_disable_current(); pointer = *ptr; kasan_enable_current(); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e6106d7e9eb0..30a464b47366 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include @@ -1013,7 +1012,6 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(PageTail(page), page); trace_mm_page_free(page, order); - kmemcheck_free_shadow(page, order); /* * Check tail pages before head page information is cleared to @@ -2669,15 +2667,6 @@ void split_page(struct page *page, unsigned int order) VM_BUG_ON_PAGE(PageCompound(page), page); VM_BUG_ON_PAGE(!page_count(page), page); -#ifdef CONFIG_KMEMCHECK - /* - * Split shadow pages too, because free(page[0]) would - * otherwise free the whole shadow. - */ - if (kmemcheck_page_is_tracked(page)) - split_page(virt_to_page(page[0].shadow), order); -#endif - for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); split_page_owner(page, order); @@ -4223,9 +4212,6 @@ out: page = NULL; } - if (kmemcheck_enabled && page) - kmemcheck_pagealloc_alloc(page, order, gfp_mask); - trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype); return page; diff --git a/mm/slab.c b/mm/slab.c index 7a5e0888a401..c84365e9a591 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -114,7 +114,6 @@ #include #include #include -#include #include #include #include @@ -1433,15 +1432,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, if (sk_memalloc_socks() && page_is_pfmemalloc(page)) SetPageSlabPfmemalloc(page); - if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { - kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); - - if (cachep->ctor) - kmemcheck_mark_uninitialized_pages(page, nr_pages); - else - kmemcheck_mark_unallocated_pages(page, nr_pages); - } - return page; } @@ -1453,8 +1443,6 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) int order = cachep->gfporder; unsigned long nr_freed = (1 << order); - kmemcheck_free_shadow(page, order); - if (cachep->flags & SLAB_RECLAIM_ACCOUNT) mod_lruvec_page_state(page, NR_SLAB_RECLAIMABLE, -nr_freed); else @@ -3515,8 +3503,6 @@ void ___cache_free(struct kmem_cache *cachep, void *objp, kmemleak_free_recursive(objp, cachep->flags); objp = cache_free_debugcheck(cachep, objp, caller); - kmemcheck_slab_free(cachep, objp, cachep->object_size); - /* * Skip calling cache_free_alien() when the platform is not numa. * This will avoid cache misses that happen while accessing slabp (which diff --git a/mm/slab.h b/mm/slab.h index e19255638cb6..e60a3d1d8f6f 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -40,7 +40,6 @@ struct kmem_cache { #include #include -#include #include #include #include @@ -439,7 +438,6 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, for (i = 0; i < size; i++) { void *object = p[i]; - kmemcheck_slab_alloc(s, flags, object, slab_ksize(s)); kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags); kasan_slab_alloc(s, object, flags); diff --git a/mm/slub.c b/mm/slub.c index 51484f0fc068..ac3b50b9abec 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -22,7 +22,6 @@ #include #include #include -#include #include #include #include @@ -1377,7 +1376,6 @@ static inline void *slab_free_hook(struct kmem_cache *s, void *x) unsigned long flags; local_irq_save(flags); - kmemcheck_slab_free(s, x, s->object_size); debug_check_no_locks_freed(x, s->object_size); local_irq_restore(flags); } @@ -1598,22 +1596,6 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) stat(s, ORDER_FALLBACK); } - if (kmemcheck_enabled && - !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { - int pages = 1 << oo_order(oo); - - kmemcheck_alloc_shadow(page, oo_order(oo), alloc_gfp, node); - - /* - * Objects from caches that have a constructor don't get - * cleared when they're allocated, so we need to do it here. - */ - if (s->ctor) - kmemcheck_mark_uninitialized_pages(page, pages); - else - kmemcheck_mark_unallocated_pages(page, pages); - } - page->objects = oo_objects(oo); order = compound_order(page); @@ -1689,8 +1671,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) check_object(s, page, p, SLUB_RED_INACTIVE); } - kmemcheck_free_shadow(page, compound_order(page)); - mod_lruvec_page_state(page, (s->flags & SLAB_RECLAIM_ACCOUNT) ? NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e140ba49b30a..6cd057b41f34 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include #include @@ -234,14 +233,12 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); - kmemcheck_annotate_variable(shinfo->destructor_arg); if (flags & SKB_ALLOC_FCLONE) { struct sk_buff_fclones *fclones; fclones = container_of(skb, struct sk_buff_fclones, skb1); - kmemcheck_annotate_bitfield(&fclones->skb2, flags1); skb->fclone = SKB_FCLONE_ORIG; refcount_set(&fclones->fclone_ref, 1); @@ -301,7 +298,6 @@ struct sk_buff *__build_skb(void *data, unsigned int frag_size) shinfo = skb_shinfo(skb); memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); atomic_set(&shinfo->dataref, 1); - kmemcheck_annotate_variable(shinfo->destructor_arg); return skb; } @@ -1283,7 +1279,6 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) if (!n) return NULL; - kmemcheck_annotate_bitfield(n, flags1); n->fclone = SKB_FCLONE_UNAVAILABLE; } diff --git a/net/core/sock.c b/net/core/sock.c index 415f441c63b9..78401fa33ce8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1469,8 +1469,6 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority, sk = kmalloc(prot->obj_size, priority); if (sk != NULL) { - kmemcheck_annotate_bitfield(sk, flags); - if (security_sk_alloc(sk, family, priority)) goto out_free; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 5b039159e67a..d451b9f19b59 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -9,7 +9,6 @@ */ #include -#include #include #include #include @@ -167,8 +166,6 @@ struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, if (tw) { const struct inet_sock *inet = inet_sk(sk); - kmemcheck_annotate_bitfield(tw, flags); - tw->tw_dr = dr; /* Give us an identity. */ tw->tw_daddr = inet->inet_daddr; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 887585045b27..c04d60a677a7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -6195,7 +6195,6 @@ struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, if (req) { struct inet_request_sock *ireq = inet_rsk(req); - kmemcheck_annotate_bitfield(ireq, flags); ireq->ireq_opt = NULL; #if IS_ENABLED(CONFIG_IPV6) ireq->pktopts = NULL; diff --git a/net/socket.c b/net/socket.c index c729625eb5d3..42d8e9c9ccd5 100644 --- a/net/socket.c +++ b/net/socket.c @@ -568,7 +568,6 @@ struct socket *sock_alloc(void) sock = SOCKET_I(inode); - kmemcheck_annotate_bitfield(sock, type); inode->i_ino = get_next_ino(); inode->i_mode = S_IFSOCK | S_IRWXUGO; inode->i_uid = current_fsuid(); From 75f296d93bcebcfe375884ddac79e30263a31766 Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:35:54 -0800 Subject: [PATCH 077/131] kmemcheck: stop using GFP_NOTRACK and SLAB_NOTRACK Convert all allocations that used a NOTRACK flag to stop using it. Link: http://lkml.kernel.org/r/20171007030159.22241-3-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Alexander Potapenko Cc: Eric W. Biederman Cc: Michal Hocko Cc: Pekka Enberg Cc: Steven Rostedt Cc: Tim Hansen Cc: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/pgalloc.h | 2 +- arch/arm64/include/asm/pgalloc.h | 2 +- arch/powerpc/include/asm/pgalloc.h | 2 +- arch/sh/kernel/dwarf.c | 4 ++-- arch/sh/kernel/process.c | 2 +- arch/sparc/mm/init_64.c | 4 ++-- arch/unicore32/include/asm/pgalloc.h | 2 +- arch/x86/kernel/espfix_64.c | 2 +- arch/x86/mm/init.c | 3 +-- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/pageattr.c | 10 +++++----- arch/x86/mm/pgtable.c | 2 +- arch/x86/platform/efi/efi_64.c | 2 +- crypto/xor.c | 7 +------ include/linux/thread_info.h | 5 ++--- init/do_mounts.c | 3 +-- kernel/fork.c | 12 ++++++------ kernel/signal.c | 3 +-- mm/kmemcheck.c | 2 +- mm/slab.c | 2 +- mm/slab.h | 5 ++--- mm/slab_common.c | 2 +- mm/slub.c | 4 +--- 23 files changed, 36 insertions(+), 48 deletions(-) diff --git a/arch/arm/include/asm/pgalloc.h b/arch/arm/include/asm/pgalloc.h index b2902a5cd780..2d7344f0e208 100644 --- a/arch/arm/include/asm/pgalloc.h +++ b/arch/arm/include/asm/pgalloc.h @@ -57,7 +57,7 @@ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) extern pgd_t *pgd_alloc(struct mm_struct *mm); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) static inline void clean_pte_table(pte_t *pte) { diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index d25f4f137c2a..5ca6a573a701 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -26,7 +26,7 @@ #define check_pgt_cache() do { } while (0) -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) #define PGD_SIZE (PTRS_PER_PGD * sizeof(pgd_t)) #if CONFIG_PGTABLE_LEVELS > 2 diff --git a/arch/powerpc/include/asm/pgalloc.h b/arch/powerpc/include/asm/pgalloc.h index a14203c005f1..e11f03007b57 100644 --- a/arch/powerpc/include/asm/pgalloc.h +++ b/arch/powerpc/include/asm/pgalloc.h @@ -18,7 +18,7 @@ static inline gfp_t pgtable_gfp_flags(struct mm_struct *mm, gfp_t gfp) } #endif /* MODULE */ -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) #ifdef CONFIG_PPC_BOOK3S #include diff --git a/arch/sh/kernel/dwarf.c b/arch/sh/kernel/dwarf.c index e1d751ae2498..1a2526676a87 100644 --- a/arch/sh/kernel/dwarf.c +++ b/arch/sh/kernel/dwarf.c @@ -1172,11 +1172,11 @@ static int __init dwarf_unwinder_init(void) dwarf_frame_cachep = kmem_cache_create("dwarf_frames", sizeof(struct dwarf_frame), 0, - SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL); + SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL); dwarf_reg_cachep = kmem_cache_create("dwarf_regs", sizeof(struct dwarf_reg), 0, - SLAB_PANIC | SLAB_HWCACHE_ALIGN | SLAB_NOTRACK, NULL); + SLAB_PANIC | SLAB_HWCACHE_ALIGN, NULL); dwarf_frame_pool = mempool_create_slab_pool(DWARF_FRAME_MIN_REQ, dwarf_frame_cachep); diff --git a/arch/sh/kernel/process.c b/arch/sh/kernel/process.c index b2d9963d5978..68b1a67533ce 100644 --- a/arch/sh/kernel/process.c +++ b/arch/sh/kernel/process.c @@ -59,7 +59,7 @@ void arch_task_cache_init(void) task_xstate_cachep = kmem_cache_create("task_xstate", xstate_size, __alignof__(union thread_xstate), - SLAB_PANIC | SLAB_NOTRACK, NULL); + SLAB_PANIC, NULL); } #ifdef CONFIG_SH_FPU_EMU diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 61bdc1270d19..2de22d703076 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2927,7 +2927,7 @@ void __flush_tlb_all(void) pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); pte_t *pte = NULL; if (page) @@ -2939,7 +2939,7 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *page = alloc_page(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) return NULL; if (!pgtable_page_ctor(page)) { diff --git a/arch/unicore32/include/asm/pgalloc.h b/arch/unicore32/include/asm/pgalloc.h index 26775793c204..f0fdb268f8f2 100644 --- a/arch/unicore32/include/asm/pgalloc.h +++ b/arch/unicore32/include/asm/pgalloc.h @@ -28,7 +28,7 @@ extern void free_pgd_slow(struct mm_struct *mm, pgd_t *pgd); #define pgd_alloc(mm) get_pgd_slow(mm) #define pgd_free(mm, pgd) free_pgd_slow(mm, pgd) -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) /* * Allocate one PTE table. diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 7d7715dde901..e5ec3cafa72e 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -57,7 +57,7 @@ # error "Need more virtual address space for the ESPFIX hack" #endif -#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL | __GFP_ZERO) /* This contains the *bottom* address of the espfix stack */ DEFINE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index a22c2b95e513..ef94620ceb8a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -92,8 +92,7 @@ __ref void *alloc_low_pages(unsigned int num) unsigned int order; order = get_order((unsigned long)num << PAGE_SHIFT); - return (void *)__get_free_pages(GFP_ATOMIC | __GFP_NOTRACK | - __GFP_ZERO, order); + return (void *)__get_free_pages(GFP_ATOMIC | __GFP_ZERO, order); } if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) { diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index adcea90a2046..5fa3a58b5d78 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -184,7 +184,7 @@ static __ref void *spp_getpage(void) void *ptr; if (after_bootmem) - ptr = (void *) get_zeroed_page(GFP_ATOMIC | __GFP_NOTRACK); + ptr = (void *) get_zeroed_page(GFP_ATOMIC); else ptr = alloc_bootmem_pages(PAGE_SIZE); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3fe68483463c..85cf12219dea 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -753,7 +753,7 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, if (!debug_pagealloc_enabled()) spin_unlock(&cpa_lock); - base = alloc_pages(GFP_KERNEL | __GFP_NOTRACK, 0); + base = alloc_pages(GFP_KERNEL, 0); if (!debug_pagealloc_enabled()) spin_lock(&cpa_lock); if (!base) @@ -904,7 +904,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) static int alloc_pte_page(pmd_t *pmd) { - pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL); if (!pte) return -1; @@ -914,7 +914,7 @@ static int alloc_pte_page(pmd_t *pmd) static int alloc_pmd_page(pud_t *pud) { - pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + pmd_t *pmd = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd) return -1; @@ -1120,7 +1120,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) pgd_entry = cpa->pgd + pgd_index(addr); if (pgd_none(*pgd_entry)) { - p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + p4d = (p4d_t *)get_zeroed_page(GFP_KERNEL); if (!p4d) return -1; @@ -1132,7 +1132,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) */ p4d = p4d_offset(pgd_entry, addr); if (p4d_none(*p4d)) { - pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); + pud = (pud_t *)get_zeroed_page(GFP_KERNEL); if (!pud) return -1; diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 17ebc5a978cc..96d456a94b03 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -7,7 +7,7 @@ #include #include -#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_ZERO) +#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) #ifdef CONFIG_HIGHPTE #define PGALLOC_USER_GFP __GFP_HIGHMEM diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 9e4ee5b04b2d..6a151ce70e86 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -207,7 +207,7 @@ int __init efi_alloc_page_tables(void) if (efi_enabled(EFI_OLD_MEMMAP)) return 0; - gfp_mask = GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO; + gfp_mask = GFP_KERNEL | __GFP_ZERO; efi_pgd = (pgd_t *)__get_free_page(gfp_mask); if (!efi_pgd) return -ENOMEM; diff --git a/crypto/xor.c b/crypto/xor.c index 263af9fb45ea..bce9fe7af40a 100644 --- a/crypto/xor.c +++ b/crypto/xor.c @@ -122,12 +122,7 @@ calibrate_xor_blocks(void) goto out; } - /* - * Note: Since the memory is not actually used for _anything_ but to - * test the XOR speed, we don't really want kmemcheck to warn about - * reading uninitialized bytes here. - */ - b1 = (void *) __get_free_pages(GFP_KERNEL | __GFP_NOTRACK, 2); + b1 = (void *) __get_free_pages(GFP_KERNEL, 2); if (!b1) { printk(KERN_WARNING "xor: Yikes! No memory available.\n"); return -ENOMEM; diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 4bcdf00c110f..34f053a150a9 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -44,10 +44,9 @@ enum { #endif #if IS_ENABLED(CONFIG_DEBUG_STACK_USAGE) || IS_ENABLED(CONFIG_DEBUG_KMEMLEAK) -# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \ - __GFP_ZERO) +# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) #else -# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK) +# define THREADINFO_GFP (GFP_KERNEL_ACCOUNT) #endif /* diff --git a/init/do_mounts.c b/init/do_mounts.c index f6d4dd764a52..7cf4f6dafd5f 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -380,8 +380,7 @@ static int __init do_mount_root(char *name, char *fs, int flags, void *data) void __init mount_block_root(char *name, int flags) { - struct page *page = alloc_page(GFP_KERNEL | - __GFP_NOTRACK_FALSE_POSITIVE); + struct page *page = alloc_page(GFP_KERNEL); char *fs_names = page_address(page); char *p; #ifdef CONFIG_BLOCK diff --git a/kernel/fork.c b/kernel/fork.c index 006dc5899a1a..4e55eedba8d6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -469,7 +469,7 @@ void __init fork_init(void) /* create a slab on which task_structs can be allocated */ task_struct_cachep = kmem_cache_create("task_struct", arch_task_struct_size, align, - SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); + SLAB_PANIC|SLAB_ACCOUNT, NULL); #endif /* do the arch specific task caches init */ @@ -2205,18 +2205,18 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU| - SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); + SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); /* * FIXME! The "sizeof(struct mm_struct)" currently includes the @@ -2227,7 +2227,7 @@ void __init proc_caches_init(void) */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); diff --git a/kernel/signal.c b/kernel/signal.c index 8dcd8825b2de..aa1fb9f905db 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1036,8 +1036,7 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t, else override_rlimit = 0; - q = __sigqueue_alloc(sig, t, GFP_ATOMIC | __GFP_NOTRACK_FALSE_POSITIVE, - override_rlimit); + q = __sigqueue_alloc(sig, t, GFP_ATOMIC, override_rlimit); if (q) { list_add_tail(&q->list, &pending->list); switch ((unsigned long) info) { diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index 800d64b854ea..b3a4d61d341c 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -18,7 +18,7 @@ void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) * With kmemcheck enabled, we need to allocate a memory area for the * shadow bits as well. */ - shadow = alloc_pages_node(node, flags | __GFP_NOTRACK, order); + shadow = alloc_pages_node(node, flags, order); if (!shadow) { if (printk_ratelimit()) pr_err("kmemcheck: failed to allocate shadow bitmap\n"); diff --git a/mm/slab.c b/mm/slab.c index c84365e9a591..183e996dde5f 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1410,7 +1410,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, flags |= cachep->allocflags; - page = __alloc_pages_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); + page = __alloc_pages_node(nodeid, flags, cachep->gfporder); if (!page) { slab_out_of_memory(cachep, flags, nodeid); return NULL; diff --git a/mm/slab.h b/mm/slab.h index e60a3d1d8f6f..ad657ffa44e5 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -141,10 +141,10 @@ static inline slab_flags_t kmem_cache_flags(unsigned long object_size, #if defined(CONFIG_SLAB) #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \ SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \ - SLAB_NOTRACK | SLAB_ACCOUNT) + SLAB_ACCOUNT) #elif defined(CONFIG_SLUB) #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT) + SLAB_TEMPORARY | SLAB_ACCOUNT) #else #define SLAB_CACHE_FLAGS (0) #endif @@ -163,7 +163,6 @@ static inline slab_flags_t kmem_cache_flags(unsigned long object_size, SLAB_NOLEAKTRACE | \ SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | \ - SLAB_NOTRACK | \ SLAB_ACCOUNT) int __kmem_cache_shutdown(struct kmem_cache *); diff --git a/mm/slab_common.c b/mm/slab_common.c index 175e86637afd..c8cb36774ba1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -44,7 +44,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, SLAB_FAILSLAB | SLAB_KASAN) #define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \ - SLAB_NOTRACK | SLAB_ACCOUNT) + SLAB_ACCOUNT) /* * Merge control. If this is set then no merging of slab caches will occur. diff --git a/mm/slub.c b/mm/slub.c index ac3b50b9abec..91aa99b4b836 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1436,8 +1436,6 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s, struct page *page; int order = oo_order(oo); - flags |= __GFP_NOTRACK; - if (node == NUMA_NO_NODE) page = alloc_pages(flags, order); else @@ -3774,7 +3772,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK; + flags |= __GFP_COMP; page = alloc_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); From d8be75663cec0069b85f80191abd2682ce4a512f Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:35:58 -0800 Subject: [PATCH 078/131] kmemcheck: remove whats left of NOTRACK flags Now that kmemcheck is gone, we don't need the NOTRACK flags. Link: http://lkml.kernel.org/r/20171007030159.22241-5-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Alexander Potapenko Cc: Eric W. Biederman Cc: Michal Hocko Cc: Pekka Enberg Cc: Steven Rostedt Cc: Tim Hansen Cc: Vegard Nossum Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 5 ----- arch/x86/include/asm/pgtable_types.h | 13 ------------- include/linux/gfp.h | 9 --------- include/linux/slab.h | 6 ------ include/trace/events/mmflags.h | 1 - mm/slub.c | 2 -- tools/perf/builtin-kmem.c | 1 - 7 files changed, 37 deletions(-) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index f735c3016325..09f9e1e00e3b 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -667,11 +667,6 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a) return false; } -static inline int pte_hidden(pte_t pte) -{ - return pte_flags(pte) & _PAGE_HIDDEN; -} - static inline int pmd_present(pmd_t pmd) { /* diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 9e9b05fc4860..3696398a9475 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -32,7 +32,6 @@ #define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 #define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 -#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ #define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_DEVMAP _PAGE_BIT_SOFTW4 @@ -79,18 +78,6 @@ #define _PAGE_KNL_ERRATUM_MASK 0 #endif -#ifdef CONFIG_KMEMCHECK -#define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) -#else -#define _PAGE_HIDDEN (_AT(pteval_t, 0)) -#endif - -/* - * The same hidden bit is used by kmemcheck, but since kmemcheck - * works on kernel pages while soft-dirty engine on user space, - * they do not conflict with each other. - */ - #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) #else diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 710143741eb5..b041f94678de 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -37,7 +37,6 @@ struct vm_area_struct; #define ___GFP_THISNODE 0x40000u #define ___GFP_ATOMIC 0x80000u #define ___GFP_ACCOUNT 0x100000u -#define ___GFP_NOTRACK 0x200000u #define ___GFP_DIRECT_RECLAIM 0x400000u #define ___GFP_WRITE 0x800000u #define ___GFP_KSWAPD_RECLAIM 0x1000000u @@ -201,19 +200,11 @@ struct vm_area_struct; * __GFP_COMP address compound page metadata. * * __GFP_ZERO returns a zeroed page on success. - * - * __GFP_NOTRACK avoids tracking with kmemcheck. - * - * __GFP_NOTRACK_FALSE_POSITIVE is an alias of __GFP_NOTRACK. It's a means of - * distinguishing in the source between false positives and allocations that - * cannot be supported (e.g. page tables). */ #define __GFP_COLD ((__force gfp_t)___GFP_COLD) #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) -#define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) -#define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) /* Disable lockdep for GFP context tracking */ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) diff --git a/include/linux/slab.h b/include/linux/slab.h index ebddfca9e24b..79f6532f8a0b 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -89,12 +89,6 @@ /* Avoid kmemleak tracing */ #define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000U) -/* Don't track use of uninitialized memory */ -#ifdef CONFIG_KMEMCHECK -# define SLAB_NOTRACK ((slab_flags_t __force)0x01000000U) -#else -# define SLAB_NOTRACK 0 -#endif /* Fault injection mark */ #ifdef CONFIG_FAILSLAB # define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000U) diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 648cbf603736..72162f3a03fa 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -46,7 +46,6 @@ {(unsigned long)__GFP_RECLAIMABLE, "__GFP_RECLAIMABLE"}, \ {(unsigned long)__GFP_MOVABLE, "__GFP_MOVABLE"}, \ {(unsigned long)__GFP_ACCOUNT, "__GFP_ACCOUNT"}, \ - {(unsigned long)__GFP_NOTRACK, "__GFP_NOTRACK"}, \ {(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \ {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ diff --git a/mm/slub.c b/mm/slub.c index 91aa99b4b836..c2c41e178acf 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5635,8 +5635,6 @@ static char *create_unique_id(struct kmem_cache *s) *p++ = 'a'; if (s->flags & SLAB_CONSISTENCY_CHECKS) *p++ = 'F'; - if (!(s->flags & SLAB_NOTRACK)) - *p++ = 't'; if (s->flags & SLAB_ACCOUNT) *p++ = 'A'; if (p != name + 1) diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 557d391f564a..cbf70738ef5f 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -655,7 +655,6 @@ static const struct { { "__GFP_RECLAIMABLE", "RC" }, { "__GFP_MOVABLE", "M" }, { "__GFP_ACCOUNT", "AC" }, - { "__GFP_NOTRACK", "NT" }, { "__GFP_WRITE", "WR" }, { "__GFP_RECLAIM", "R" }, { "__GFP_DIRECT_RECLAIM", "DR" }, From 4675ff05de2d76d167336b368bd07f3fef6ed5a6 Mon Sep 17 00:00:00 2001 From: "Levin, Alexander (Sasha Levin)" Date: Wed, 15 Nov 2017 17:36:02 -0800 Subject: [PATCH 079/131] kmemcheck: rip it out Fix up makefiles, remove references, and git rm kmemcheck. Link: http://lkml.kernel.org/r/20171007030159.22241-4-alexander.levin@verizon.com Signed-off-by: Sasha Levin Cc: Steven Rostedt Cc: Vegard Nossum Cc: Pekka Enberg Cc: Michal Hocko Cc: Eric W. Biederman Cc: Alexander Potapenko Cc: Tim Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../admin-guide/kernel-parameters.txt | 7 - Documentation/dev-tools/index.rst | 1 - Documentation/dev-tools/kmemcheck.rst | 733 ------------------ MAINTAINERS | 10 - arch/x86/Kconfig | 3 +- arch/x86/include/asm/kmemcheck.h | 42 - arch/x86/include/asm/string_32.h | 9 - arch/x86/include/asm/string_64.h | 8 - arch/x86/kernel/cpu/intel.c | 15 - arch/x86/mm/Makefile | 2 - arch/x86/mm/init.c | 5 +- arch/x86/mm/kmemcheck/Makefile | 1 - arch/x86/mm/kmemcheck/error.c | 227 ------ arch/x86/mm/kmemcheck/error.h | 15 - arch/x86/mm/kmemcheck/kmemcheck.c | 658 ---------------- arch/x86/mm/kmemcheck/opcode.c | 106 --- arch/x86/mm/kmemcheck/opcode.h | 9 - arch/x86/mm/kmemcheck/pte.c | 22 - arch/x86/mm/kmemcheck/pte.h | 10 - arch/x86/mm/kmemcheck/selftest.c | 70 -- arch/x86/mm/kmemcheck/selftest.h | 6 - arch/x86/mm/kmemcheck/shadow.c | 173 ----- arch/x86/mm/kmemcheck/shadow.h | 18 - include/linux/interrupt.h | 15 - include/linux/kmemcheck.h | 171 ---- kernel/softirq.c | 10 - kernel/sysctl.c | 10 - lib/Kconfig.debug | 6 +- lib/Kconfig.kmemcheck | 94 --- mm/Kconfig.debug | 1 - mm/Makefile | 2 - mm/kmemcheck.c | 125 --- mm/slub.c | 5 +- scripts/kernel-doc | 2 - tools/include/linux/kmemcheck.h | 8 - 35 files changed, 7 insertions(+), 2592 deletions(-) delete mode 100644 Documentation/dev-tools/kmemcheck.rst delete mode 100644 arch/x86/mm/kmemcheck/Makefile delete mode 100644 arch/x86/mm/kmemcheck/kmemcheck.c delete mode 100644 arch/x86/mm/kmemcheck/shadow.c delete mode 100644 lib/Kconfig.kmemcheck diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b74e13312fdc..00bb04972612 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1864,13 +1864,6 @@ Built with CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF=y, the default is off. - kmemcheck= [X86] Boot-time kmemcheck enable/disable/one-shot mode - Valid arguments: 0, 1, 2 - kmemcheck=0 (disabled) - kmemcheck=1 (enabled) - kmemcheck=2 (one-shot mode) - Default: 2 (one-shot mode) - kvm.ignore_msrs=[KVM] Ignore guest accesses to unhandled MSRs. Default is 0 (don't ignore, but inject #GP) diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst index a81787cd47d7..e313925fb0fa 100644 --- a/Documentation/dev-tools/index.rst +++ b/Documentation/dev-tools/index.rst @@ -21,7 +21,6 @@ whole; patches welcome! kasan ubsan kmemleak - kmemcheck gdb-kernel-debugging kgdb kselftest diff --git a/Documentation/dev-tools/kmemcheck.rst b/Documentation/dev-tools/kmemcheck.rst deleted file mode 100644 index 7f3d1985de74..000000000000 --- a/Documentation/dev-tools/kmemcheck.rst +++ /dev/null @@ -1,733 +0,0 @@ -Getting started with kmemcheck -============================== - -Vegard Nossum - - -Introduction ------------- - -kmemcheck is a debugging feature for the Linux Kernel. More specifically, it -is a dynamic checker that detects and warns about some uses of uninitialized -memory. - -Userspace programmers might be familiar with Valgrind's memcheck. The main -difference between memcheck and kmemcheck is that memcheck works for userspace -programs only, and kmemcheck works for the kernel only. The implementations -are of course vastly different. Because of this, kmemcheck is not as accurate -as memcheck, but it turns out to be good enough in practice to discover real -programmer errors that the compiler is not able to find through static -analysis. - -Enabling kmemcheck on a kernel will probably slow it down to the extent that -the machine will not be usable for normal workloads such as e.g. an -interactive desktop. kmemcheck will also cause the kernel to use about twice -as much memory as normal. For this reason, kmemcheck is strictly a debugging -feature. - - -Downloading ------------ - -As of version 2.6.31-rc1, kmemcheck is included in the mainline kernel. - - -Configuring and compiling -------------------------- - -kmemcheck only works for the x86 (both 32- and 64-bit) platform. A number of -configuration variables must have specific settings in order for the kmemcheck -menu to even appear in "menuconfig". These are: - -- ``CONFIG_CC_OPTIMIZE_FOR_SIZE=n`` - This option is located under "General setup" / "Optimize for size". - - Without this, gcc will use certain optimizations that usually lead to - false positive warnings from kmemcheck. An example of this is a 16-bit - field in a struct, where gcc may load 32 bits, then discard the upper - 16 bits. kmemcheck sees only the 32-bit load, and may trigger a - warning for the upper 16 bits (if they're uninitialized). - -- ``CONFIG_SLAB=y`` or ``CONFIG_SLUB=y`` - This option is located under "General setup" / "Choose SLAB - allocator". - -- ``CONFIG_FUNCTION_TRACER=n`` - This option is located under "Kernel hacking" / "Tracers" / "Kernel - Function Tracer" - - When function tracing is compiled in, gcc emits a call to another - function at the beginning of every function. This means that when the - page fault handler is called, the ftrace framework will be called - before kmemcheck has had a chance to handle the fault. If ftrace then - modifies memory that was tracked by kmemcheck, the result is an - endless recursive page fault. - -- ``CONFIG_DEBUG_PAGEALLOC=n`` - This option is located under "Kernel hacking" / "Memory Debugging" - / "Debug page memory allocations". - -In addition, I highly recommend turning on ``CONFIG_DEBUG_INFO=y``. This is also -located under "Kernel hacking". With this, you will be able to get line number -information from the kmemcheck warnings, which is extremely valuable in -debugging a problem. This option is not mandatory, however, because it slows -down the compilation process and produces a much bigger kernel image. - -Now the kmemcheck menu should be visible (under "Kernel hacking" / "Memory -Debugging" / "kmemcheck: trap use of uninitialized memory"). Here follows -a description of the kmemcheck configuration variables: - -- ``CONFIG_KMEMCHECK`` - This must be enabled in order to use kmemcheck at all... - -- ``CONFIG_KMEMCHECK_``[``DISABLED`` | ``ENABLED`` | ``ONESHOT``]``_BY_DEFAULT`` - This option controls the status of kmemcheck at boot-time. "Enabled" - will enable kmemcheck right from the start, "disabled" will boot the - kernel as normal (but with the kmemcheck code compiled in, so it can - be enabled at run-time after the kernel has booted), and "one-shot" is - a special mode which will turn kmemcheck off automatically after - detecting the first use of uninitialized memory. - - If you are using kmemcheck to actively debug a problem, then you - probably want to choose "enabled" here. - - The one-shot mode is mostly useful in automated test setups because it - can prevent floods of warnings and increase the chances of the machine - surviving in case something is really wrong. In other cases, the one- - shot mode could actually be counter-productive because it would turn - itself off at the very first error -- in the case of a false positive - too -- and this would come in the way of debugging the specific - problem you were interested in. - - If you would like to use your kernel as normal, but with a chance to - enable kmemcheck in case of some problem, it might be a good idea to - choose "disabled" here. When kmemcheck is disabled, most of the run- - time overhead is not incurred, and the kernel will be almost as fast - as normal. - -- ``CONFIG_KMEMCHECK_QUEUE_SIZE`` - Select the maximum number of error reports to store in an internal - (fixed-size) buffer. Since errors can occur virtually anywhere and in - any context, we need a temporary storage area which is guaranteed not - to generate any other page faults when accessed. The queue will be - emptied as soon as a tasklet may be scheduled. If the queue is full, - new error reports will be lost. - - The default value of 64 is probably fine. If some code produces more - than 64 errors within an irqs-off section, then the code is likely to - produce many, many more, too, and these additional reports seldom give - any more information (the first report is usually the most valuable - anyway). - - This number might have to be adjusted if you are not using serial - console or similar to capture the kernel log. If you are using the - "dmesg" command to save the log, then getting a lot of kmemcheck - warnings might overflow the kernel log itself, and the earlier reports - will get lost in that way instead. Try setting this to 10 or so on - such a setup. - -- ``CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT`` - Select the number of shadow bytes to save along with each entry of the - error-report queue. These bytes indicate what parts of an allocation - are initialized, uninitialized, etc. and will be displayed when an - error is detected to help the debugging of a particular problem. - - The number entered here is actually the logarithm of the number of - bytes that will be saved. So if you pick for example 5 here, kmemcheck - will save 2^5 = 32 bytes. - - The default value should be fine for debugging most problems. It also - fits nicely within 80 columns. - -- ``CONFIG_KMEMCHECK_PARTIAL_OK`` - This option (when enabled) works around certain GCC optimizations that - produce 32-bit reads from 16-bit variables where the upper 16 bits are - thrown away afterwards. - - The default value (enabled) is recommended. This may of course hide - some real errors, but disabling it would probably produce a lot of - false positives. - -- ``CONFIG_KMEMCHECK_BITOPS_OK`` - This option silences warnings that would be generated for bit-field - accesses where not all the bits are initialized at the same time. This - may also hide some real bugs. - - This option is probably obsolete, or it should be replaced with - the kmemcheck-/bitfield-annotations for the code in question. The - default value is therefore fine. - -Now compile the kernel as usual. - - -How to use ----------- - -Booting -~~~~~~~ - -First some information about the command-line options. There is only one -option specific to kmemcheck, and this is called "kmemcheck". It can be used -to override the default mode as chosen by the ``CONFIG_KMEMCHECK_*_BY_DEFAULT`` -option. Its possible settings are: - -- ``kmemcheck=0`` (disabled) -- ``kmemcheck=1`` (enabled) -- ``kmemcheck=2`` (one-shot mode) - -If SLUB debugging has been enabled in the kernel, it may take precedence over -kmemcheck in such a way that the slab caches which are under SLUB debugging -will not be tracked by kmemcheck. In order to ensure that this doesn't happen -(even though it shouldn't by default), use SLUB's boot option ``slub_debug``, -like this: ``slub_debug=-`` - -In fact, this option may also be used for fine-grained control over SLUB vs. -kmemcheck. For example, if the command line includes -``kmemcheck=1 slub_debug=,dentry``, then SLUB debugging will be used only -for the "dentry" slab cache, and with kmemcheck tracking all the other -caches. This is advanced usage, however, and is not generally recommended. - - -Run-time enable/disable -~~~~~~~~~~~~~~~~~~~~~~~ - -When the kernel has booted, it is possible to enable or disable kmemcheck at -run-time. WARNING: This feature is still experimental and may cause false -positive warnings to appear. Therefore, try not to use this. If you find that -it doesn't work properly (e.g. you see an unreasonable amount of warnings), I -will be happy to take bug reports. - -Use the file ``/proc/sys/kernel/kmemcheck`` for this purpose, e.g.:: - - $ echo 0 > /proc/sys/kernel/kmemcheck # disables kmemcheck - -The numbers are the same as for the ``kmemcheck=`` command-line option. - - -Debugging -~~~~~~~~~ - -A typical report will look something like this:: - - WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024) - 80000000000000000000000000000000000000000088ffff0000000000000000 - i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u - ^ - - Pid: 1856, comm: ntpdate Not tainted 2.6.29-rc5 #264 945P-A - RIP: 0010:[] [] __dequeue_signal+0xc8/0x190 - RSP: 0018:ffff88003cdf7d98 EFLAGS: 00210002 - RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009 - RDX: ffff88003e5d6018 RSI: ffff88003e5d6024 RDI: ffff88003cdf7e84 - RBP: ffff88003cdf7db8 R08: ffff88003e5d6000 R09: 0000000000000000 - R10: 0000000000000080 R11: 0000000000000000 R12: 000000000000000e - R13: ffff88003cdf7e78 R14: ffff88003d530710 R15: ffff88003d5a98c8 - FS: 0000000000000000(0000) GS:ffff880001982000(0063) knlGS:00000 - CS: 0010 DS: 002b ES: 002b CR0: 0000000080050033 - CR2: ffff88003f806ea0 CR3: 000000003c036000 CR4: 00000000000006a0 - DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 - DR3: 0000000000000000 DR6: 00000000ffff4ff0 DR7: 0000000000000400 - [] dequeue_signal+0x8e/0x170 - [] get_signal_to_deliver+0x98/0x390 - [] do_notify_resume+0xad/0x7d0 - [] int_signal+0x12/0x17 - [] 0xffffffffffffffff - -The single most valuable information in this report is the RIP (or EIP on 32- -bit) value. This will help us pinpoint exactly which instruction that caused -the warning. - -If your kernel was compiled with ``CONFIG_DEBUG_INFO=y``, then all we have to do -is give this address to the addr2line program, like this:: - - $ addr2line -e vmlinux -i ffffffff8104ede8 - arch/x86/include/asm/string_64.h:12 - include/asm-generic/siginfo.h:287 - kernel/signal.c:380 - kernel/signal.c:410 - -The "``-e vmlinux``" tells addr2line which file to look in. **IMPORTANT:** -This must be the vmlinux of the kernel that produced the warning in the -first place! If not, the line number information will almost certainly be -wrong. - -The "``-i``" tells addr2line to also print the line numbers of inlined -functions. In this case, the flag was very important, because otherwise, -it would only have printed the first line, which is just a call to -``memcpy()``, which could be called from a thousand places in the kernel, and -is therefore not very useful. These inlined functions would not show up in -the stack trace above, simply because the kernel doesn't load the extra -debugging information. This technique can of course be used with ordinary -kernel oopses as well. - -In this case, it's the caller of ``memcpy()`` that is interesting, and it can be -found in ``include/asm-generic/siginfo.h``, line 287:: - - 281 static inline void copy_siginfo(struct siginfo *to, struct siginfo *from) - 282 { - 283 if (from->si_code < 0) - 284 memcpy(to, from, sizeof(*to)); - 285 else - 286 /* _sigchld is currently the largest know union member */ - 287 memcpy(to, from, __ARCH_SI_PREAMBLE_SIZE + sizeof(from->_sifields._sigchld)); - 288 } - -Since this was a read (kmemcheck usually warns about reads only, though it can -warn about writes to unallocated or freed memory as well), it was probably the -"from" argument which contained some uninitialized bytes. Following the chain -of calls, we move upwards to see where "from" was allocated or initialized, -``kernel/signal.c``, line 380:: - - 359 static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) - 360 { - ... - 367 list_for_each_entry(q, &list->list, list) { - 368 if (q->info.si_signo == sig) { - 369 if (first) - 370 goto still_pending; - 371 first = q; - ... - 377 if (first) { - 378 still_pending: - 379 list_del_init(&first->list); - 380 copy_siginfo(info, &first->info); - 381 __sigqueue_free(first); - ... - 392 } - 393 } - -Here, it is ``&first->info`` that is being passed on to ``copy_siginfo()``. The -variable ``first`` was found on a list -- passed in as the second argument to -``collect_signal()``. We continue our journey through the stack, to figure out -where the item on "list" was allocated or initialized. We move to line 410:: - - 395 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, - 396 siginfo_t *info) - 397 { - ... - 410 collect_signal(sig, pending, info); - ... - 414 } - -Now we need to follow the ``pending`` pointer, since that is being passed on to -``collect_signal()`` as ``list``. At this point, we've run out of lines from the -"addr2line" output. Not to worry, we just paste the next addresses from the -kmemcheck stack dump, i.e.:: - - [] dequeue_signal+0x8e/0x170 - [] get_signal_to_deliver+0x98/0x390 - [] do_notify_resume+0xad/0x7d0 - [] int_signal+0x12/0x17 - - $ addr2line -e vmlinux -i ffffffff8104f04e ffffffff81050bd8 \ - ffffffff8100b87d ffffffff8100c7b5 - kernel/signal.c:446 - kernel/signal.c:1806 - arch/x86/kernel/signal.c:805 - arch/x86/kernel/signal.c:871 - arch/x86/kernel/entry_64.S:694 - -Remember that since these addresses were found on the stack and not as the -RIP value, they actually point to the _next_ instruction (they are return -addresses). This becomes obvious when we look at the code for line 446:: - - 422 int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) - 423 { - ... - 431 signr = __dequeue_signal(&tsk->signal->shared_pending, - 432 mask, info); - 433 /* - 434 * itimer signal ? - 435 * - 436 * itimers are process shared and we restart periodic - 437 * itimers in the signal delivery path to prevent DoS - 438 * attacks in the high resolution timer case. This is - 439 * compliant with the old way of self restarting - 440 * itimers, as the SIGALRM is a legacy signal and only - 441 * queued once. Changing the restart behaviour to - 442 * restart the timer in the signal dequeue path is - 443 * reducing the timer noise on heavy loaded !highres - 444 * systems too. - 445 */ - 446 if (unlikely(signr == SIGALRM)) { - ... - 489 } - -So instead of looking at 446, we should be looking at 431, which is the line -that executes just before 446. Here we see that what we are looking for is -``&tsk->signal->shared_pending``. - -Our next task is now to figure out which function that puts items on this -``shared_pending`` list. A crude, but efficient tool, is ``git grep``:: - - $ git grep -n 'shared_pending' kernel/ - ... - kernel/signal.c:828: pending = group ? &t->signal->shared_pending : &t->pending; - kernel/signal.c:1339: pending = group ? &t->signal->shared_pending : &t->pending; - ... - -There were more results, but none of them were related to list operations, -and these were the only assignments. We inspect the line numbers more closely -and find that this is indeed where items are being added to the list:: - - 816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t, - 817 int group) - 818 { - ... - 828 pending = group ? &t->signal->shared_pending : &t->pending; - ... - 851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && - 852 (is_si_special(info) || - 853 info->si_code >= 0))); - 854 if (q) { - 855 list_add_tail(&q->list, &pending->list); - ... - 890 } - -and:: - - 1309 int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group) - 1310 { - .... - 1339 pending = group ? &t->signal->shared_pending : &t->pending; - 1340 list_add_tail(&q->list, &pending->list); - .... - 1347 } - -In the first case, the list element we are looking for, ``q``, is being -returned from the function ``__sigqueue_alloc()``, which looks like an -allocation function. Let's take a look at it:: - - 187 static struct sigqueue *__sigqueue_alloc(struct task_struct *t, gfp_t flags, - 188 int override_rlimit) - 189 { - 190 struct sigqueue *q = NULL; - 191 struct user_struct *user; - 192 - 193 /* - 194 * We won't get problems with the target's UID changing under us - 195 * because changing it requires RCU be used, and if t != current, the - 196 * caller must be holding the RCU readlock (by way of a spinlock) and - 197 * we use RCU protection here - 198 */ - 199 user = get_uid(__task_cred(t)->user); - 200 atomic_inc(&user->sigpending); - 201 if (override_rlimit || - 202 atomic_read(&user->sigpending) <= - 203 t->signal->rlim[RLIMIT_SIGPENDING].rlim_cur) - 204 q = kmem_cache_alloc(sigqueue_cachep, flags); - 205 if (unlikely(q == NULL)) { - 206 atomic_dec(&user->sigpending); - 207 free_uid(user); - 208 } else { - 209 INIT_LIST_HEAD(&q->list); - 210 q->flags = 0; - 211 q->user = user; - 212 } - 213 - 214 return q; - 215 } - -We see that this function initializes ``q->list``, ``q->flags``, and -``q->user``. It seems that now is the time to look at the definition of -``struct sigqueue``, e.g.:: - - 14 struct sigqueue { - 15 struct list_head list; - 16 int flags; - 17 siginfo_t info; - 18 struct user_struct *user; - 19 }; - -And, you might remember, it was a ``memcpy()`` on ``&first->info`` that -caused the warning, so this makes perfect sense. It also seems reasonable -to assume that it is the caller of ``__sigqueue_alloc()`` that has the -responsibility of filling out (initializing) this member. - -But just which fields of the struct were uninitialized? Let's look at -kmemcheck's report again:: - - WARNING: kmemcheck: Caught 32-bit read from uninitialized memory (ffff88003e4a2024) - 80000000000000000000000000000000000000000088ffff0000000000000000 - i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u - ^ - -These first two lines are the memory dump of the memory object itself, and -the shadow bytemap, respectively. The memory object itself is in this case -``&first->info``. Just beware that the start of this dump is NOT the start -of the object itself! The position of the caret (^) corresponds with the -address of the read (ffff88003e4a2024). - -The shadow bytemap dump legend is as follows: - -- i: initialized -- u: uninitialized -- a: unallocated (memory has been allocated by the slab layer, but has not - yet been handed off to anybody) -- f: freed (memory has been allocated by the slab layer, but has been freed - by the previous owner) - -In order to figure out where (relative to the start of the object) the -uninitialized memory was located, we have to look at the disassembly. For -that, we'll need the RIP address again:: - - RIP: 0010:[] [] __dequeue_signal+0xc8/0x190 - - $ objdump -d --no-show-raw-insn vmlinux | grep -C 8 ffffffff8104ede8: - ffffffff8104edc8: mov %r8,0x8(%r8) - ffffffff8104edcc: test %r10d,%r10d - ffffffff8104edcf: js ffffffff8104ee88 <__dequeue_signal+0x168> - ffffffff8104edd5: mov %rax,%rdx - ffffffff8104edd8: mov $0xc,%ecx - ffffffff8104eddd: mov %r13,%rdi - ffffffff8104ede0: mov $0x30,%eax - ffffffff8104ede5: mov %rdx,%rsi - ffffffff8104ede8: rep movsl %ds:(%rsi),%es:(%rdi) - ffffffff8104edea: test $0x2,%al - ffffffff8104edec: je ffffffff8104edf0 <__dequeue_signal+0xd0> - ffffffff8104edee: movsw %ds:(%rsi),%es:(%rdi) - ffffffff8104edf0: test $0x1,%al - ffffffff8104edf2: je ffffffff8104edf5 <__dequeue_signal+0xd5> - ffffffff8104edf4: movsb %ds:(%rsi),%es:(%rdi) - ffffffff8104edf5: mov %r8,%rdi - ffffffff8104edf8: callq ffffffff8104de60 <__sigqueue_free> - -As expected, it's the "``rep movsl``" instruction from the ``memcpy()`` -that causes the warning. We know about ``REP MOVSL`` that it uses the register -``RCX`` to count the number of remaining iterations. By taking a look at the -register dump again (from the kmemcheck report), we can figure out how many -bytes were left to copy:: - - RAX: 0000000000000030 RBX: ffff88003d4ea968 RCX: 0000000000000009 - -By looking at the disassembly, we also see that ``%ecx`` is being loaded -with the value ``$0xc`` just before (ffffffff8104edd8), so we are very -lucky. Keep in mind that this is the number of iterations, not bytes. And -since this is a "long" operation, we need to multiply by 4 to get the -number of bytes. So this means that the uninitialized value was encountered -at 4 * (0xc - 0x9) = 12 bytes from the start of the object. - -We can now try to figure out which field of the "``struct siginfo``" that -was not initialized. This is the beginning of the struct:: - - 40 typedef struct siginfo { - 41 int si_signo; - 42 int si_errno; - 43 int si_code; - 44 - 45 union { - .. - 92 } _sifields; - 93 } siginfo_t; - -On 64-bit, the int is 4 bytes long, so it must the union member that has -not been initialized. We can verify this using gdb:: - - $ gdb vmlinux - ... - (gdb) p &((struct siginfo *) 0)->_sifields - $1 = (union {...} *) 0x10 - -Actually, it seems that the union member is located at offset 0x10 -- which -means that gcc has inserted 4 bytes of padding between the members ``si_code`` -and ``_sifields``. We can now get a fuller picture of the memory dump:: - - _----------------------------=> si_code - / _--------------------=> (padding) - | / _------------=> _sifields(._kill._pid) - | | / _----=> _sifields(._kill._uid) - | | | / - -------|-------|-------|-------| - 80000000000000000000000000000000000000000088ffff0000000000000000 - i i i i u u u u i i i i i i i i u u u u u u u u u u u u u u u u - -This allows us to realize another important fact: ``si_code`` contains the -value 0x80. Remember that x86 is little endian, so the first 4 bytes -"80000000" are really the number 0x00000080. With a bit of research, we -find that this is actually the constant ``SI_KERNEL`` defined in -``include/asm-generic/siginfo.h``:: - - 144 #define SI_KERNEL 0x80 /* sent by the kernel from somewhere */ - -This macro is used in exactly one place in the x86 kernel: In ``send_signal()`` -in ``kernel/signal.c``:: - - 816 static int send_signal(int sig, struct siginfo *info, struct task_struct *t, - 817 int group) - 818 { - ... - 828 pending = group ? &t->signal->shared_pending : &t->pending; - ... - 851 q = __sigqueue_alloc(t, GFP_ATOMIC, (sig < SIGRTMIN && - 852 (is_si_special(info) || - 853 info->si_code >= 0))); - 854 if (q) { - 855 list_add_tail(&q->list, &pending->list); - 856 switch ((unsigned long) info) { - ... - 865 case (unsigned long) SEND_SIG_PRIV: - 866 q->info.si_signo = sig; - 867 q->info.si_errno = 0; - 868 q->info.si_code = SI_KERNEL; - 869 q->info.si_pid = 0; - 870 q->info.si_uid = 0; - 871 break; - ... - 890 } - -Not only does this match with the ``.si_code`` member, it also matches the place -we found earlier when looking for where siginfo_t objects are enqueued on the -``shared_pending`` list. - -So to sum up: It seems that it is the padding introduced by the compiler -between two struct fields that is uninitialized, and this gets reported when -we do a ``memcpy()`` on the struct. This means that we have identified a false -positive warning. - -Normally, kmemcheck will not report uninitialized accesses in ``memcpy()`` calls -when both the source and destination addresses are tracked. (Instead, we copy -the shadow bytemap as well). In this case, the destination address clearly -was not tracked. We can dig a little deeper into the stack trace from above:: - - arch/x86/kernel/signal.c:805 - arch/x86/kernel/signal.c:871 - arch/x86/kernel/entry_64.S:694 - -And we clearly see that the destination siginfo object is located on the -stack:: - - 782 static void do_signal(struct pt_regs *regs) - 783 { - 784 struct k_sigaction ka; - 785 siginfo_t info; - ... - 804 signr = get_signal_to_deliver(&info, &ka, regs, NULL); - ... - 854 } - -And this ``&info`` is what eventually gets passed to ``copy_siginfo()`` as the -destination argument. - -Now, even though we didn't find an actual error here, the example is still a -good one, because it shows how one would go about to find out what the report -was all about. - - -Annotating false positives -~~~~~~~~~~~~~~~~~~~~~~~~~~ - -There are a few different ways to make annotations in the source code that -will keep kmemcheck from checking and reporting certain allocations. Here -they are: - -- ``__GFP_NOTRACK_FALSE_POSITIVE`` - This flag can be passed to ``kmalloc()`` or ``kmem_cache_alloc()`` - (therefore also to other functions that end up calling one of - these) to indicate that the allocation should not be tracked - because it would lead to a false positive report. This is a "big - hammer" way of silencing kmemcheck; after all, even if the false - positive pertains to particular field in a struct, for example, we - will now lose the ability to find (real) errors in other parts of - the same struct. - - Example:: - - /* No warnings will ever trigger on accessing any part of x */ - x = kmalloc(sizeof *x, GFP_KERNEL | __GFP_NOTRACK_FALSE_POSITIVE); - -- ``kmemcheck_bitfield_begin(name)``/``kmemcheck_bitfield_end(name)`` and - ``kmemcheck_annotate_bitfield(ptr, name)`` - The first two of these three macros can be used inside struct - definitions to signal, respectively, the beginning and end of a - bitfield. Additionally, this will assign the bitfield a name, which - is given as an argument to the macros. - - Having used these markers, one can later use - kmemcheck_annotate_bitfield() at the point of allocation, to indicate - which parts of the allocation is part of a bitfield. - - Example:: - - struct foo { - int x; - - kmemcheck_bitfield_begin(flags); - int flag_a:1; - int flag_b:1; - kmemcheck_bitfield_end(flags); - - int y; - }; - - struct foo *x = kmalloc(sizeof *x); - - /* No warnings will trigger on accessing the bitfield of x */ - kmemcheck_annotate_bitfield(x, flags); - - Note that ``kmemcheck_annotate_bitfield()`` can be used even before the - return value of ``kmalloc()`` is checked -- in other words, passing NULL - as the first argument is legal (and will do nothing). - - -Reporting errors ----------------- - -As we have seen, kmemcheck will produce false positive reports. Therefore, it -is not very wise to blindly post kmemcheck warnings to mailing lists and -maintainers. Instead, I encourage maintainers and developers to find errors -in their own code. If you get a warning, you can try to work around it, try -to figure out if it's a real error or not, or simply ignore it. Most -developers know their own code and will quickly and efficiently determine the -root cause of a kmemcheck report. This is therefore also the most efficient -way to work with kmemcheck. - -That said, we (the kmemcheck maintainers) will always be on the lookout for -false positives that we can annotate and silence. So whatever you find, -please drop us a note privately! Kernel configs and steps to reproduce (if -available) are of course a great help too. - -Happy hacking! - - -Technical description ---------------------- - -kmemcheck works by marking memory pages non-present. This means that whenever -somebody attempts to access the page, a page fault is generated. The page -fault handler notices that the page was in fact only hidden, and so it calls -on the kmemcheck code to make further investigations. - -When the investigations are completed, kmemcheck "shows" the page by marking -it present (as it would be under normal circumstances). This way, the -interrupted code can continue as usual. - -But after the instruction has been executed, we should hide the page again, so -that we can catch the next access too! Now kmemcheck makes use of a debugging -feature of the processor, namely single-stepping. When the processor has -finished the one instruction that generated the memory access, a debug -exception is raised. From here, we simply hide the page again and continue -execution, this time with the single-stepping feature turned off. - -kmemcheck requires some assistance from the memory allocator in order to work. -The memory allocator needs to - - 1. Tell kmemcheck about newly allocated pages and pages that are about to - be freed. This allows kmemcheck to set up and tear down the shadow memory - for the pages in question. The shadow memory stores the status of each - byte in the allocation proper, e.g. whether it is initialized or - uninitialized. - - 2. Tell kmemcheck which parts of memory should be marked uninitialized. - There are actually a few more states, such as "not yet allocated" and - "recently freed". - -If a slab cache is set up using the SLAB_NOTRACK flag, it will never return -memory that can take page faults because of kmemcheck. - -If a slab cache is NOT set up using the SLAB_NOTRACK flag, callers can still -request memory with the __GFP_NOTRACK or __GFP_NOTRACK_FALSE_POSITIVE flags. -This does not prevent the page faults from occurring, however, but marks the -object in question as being initialized so that no warnings will ever be -produced for this object. - -Currently, the SLAB and SLUB allocators are supported by kmemcheck. diff --git a/MAINTAINERS b/MAINTAINERS index 7e9c887ad951..ac814d3dd1c1 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7688,16 +7688,6 @@ F: include/linux/kdb.h F: include/linux/kgdb.h F: kernel/debug/ -KMEMCHECK -M: Vegard Nossum -M: Pekka Enberg -S: Maintained -F: Documentation/dev-tools/kmemcheck.rst -F: arch/x86/include/asm/kmemcheck.h -F: arch/x86/mm/kmemcheck/ -F: include/linux/kmemcheck.h -F: mm/kmemcheck.c - KMEMLEAK M: Catalin Marinas S: Maintained diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f08977d82ca0..cb678192da4a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -112,7 +112,6 @@ config X86 select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP select HAVE_ARCH_KGDB - select HAVE_ARCH_KMEMCHECK select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT select HAVE_ARCH_COMPAT_MMAP_BASES if MMU && COMPAT @@ -1430,7 +1429,7 @@ config ARCH_DMA_ADDR_T_64BIT config X86_DIRECT_GBPAGES def_bool y - depends on X86_64 && !DEBUG_PAGEALLOC && !KMEMCHECK + depends on X86_64 && !DEBUG_PAGEALLOC ---help--- Certain kernel features effectively disable kernel linear 1 GB mappings (even if the CPU otherwise diff --git a/arch/x86/include/asm/kmemcheck.h b/arch/x86/include/asm/kmemcheck.h index 945a0337fbcf..ea32a7d3cf1b 100644 --- a/arch/x86/include/asm/kmemcheck.h +++ b/arch/x86/include/asm/kmemcheck.h @@ -1,43 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ASM_X86_KMEMCHECK_H -#define ASM_X86_KMEMCHECK_H - -#include -#include - -#ifdef CONFIG_KMEMCHECK -bool kmemcheck_active(struct pt_regs *regs); - -void kmemcheck_show(struct pt_regs *regs); -void kmemcheck_hide(struct pt_regs *regs); - -bool kmemcheck_fault(struct pt_regs *regs, - unsigned long address, unsigned long error_code); -bool kmemcheck_trap(struct pt_regs *regs); -#else -static inline bool kmemcheck_active(struct pt_regs *regs) -{ - return false; -} - -static inline void kmemcheck_show(struct pt_regs *regs) -{ -} - -static inline void kmemcheck_hide(struct pt_regs *regs) -{ -} - -static inline bool kmemcheck_fault(struct pt_regs *regs, - unsigned long address, unsigned long error_code) -{ - return false; -} - -static inline bool kmemcheck_trap(struct pt_regs *regs) -{ - return false; -} -#endif /* CONFIG_KMEMCHECK */ - -#endif diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h index 076502241eae..55d392c6bd29 100644 --- a/arch/x86/include/asm/string_32.h +++ b/arch/x86/include/asm/string_32.h @@ -179,8 +179,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) * No 3D Now! */ -#ifndef CONFIG_KMEMCHECK - #if (__GNUC__ >= 4) #define memcpy(t, f, n) __builtin_memcpy(t, f, n) #else @@ -189,13 +187,6 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len) ? __constant_memcpy((t), (f), (n)) \ : __memcpy((t), (f), (n))) #endif -#else -/* - * kmemcheck becomes very happy if we use the REP instructions unconditionally, - * because it means that we know both memory operands in advance. - */ -#define memcpy(t, f, n) __memcpy((t), (f), (n)) -#endif #endif #endif /* !CONFIG_FORTIFY_SOURCE */ diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 0b1b4445f4c5..533f74c300c2 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -33,7 +33,6 @@ extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); #ifndef CONFIG_FORTIFY_SOURCE -#ifndef CONFIG_KMEMCHECK #if (__GNUC__ == 4 && __GNUC_MINOR__ < 3) || __GNUC__ < 4 #define memcpy(dst, src, len) \ ({ \ @@ -46,13 +45,6 @@ extern void *__memcpy(void *to, const void *from, size_t len); __ret; \ }) #endif -#else -/* - * kmemcheck becomes very happy if we use the REP instructions unconditionally, - * because it means that we know both memory operands in advance. - */ -#define memcpy(dst, src, len) __inline_memcpy((dst), (src), (len)) -#endif #endif /* !CONFIG_FORTIFY_SOURCE */ #define __HAVE_ARCH_MEMSET diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index b720dacac051..b1af22073e28 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -187,21 +187,6 @@ static void early_init_intel(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model < 15) clear_cpu_cap(c, X86_FEATURE_PAT); -#ifdef CONFIG_KMEMCHECK - /* - * P4s have a "fast strings" feature which causes single- - * stepping REP instructions to only generate a #DB on - * cache-line boundaries. - * - * Ingo Molnar reported a Pentium D (model 6) and a Xeon - * (model 2) with the same problem. - */ - if (c->x86 == 15) - if (msr_clear_bit(MSR_IA32_MISC_ENABLE, - MSR_IA32_MISC_ENABLE_FAST_STRING_BIT) > 0) - pr_info("kmemcheck: Disabling fast string operations\n"); -#endif - /* * If fast string is not enabled in IA32_MISC_ENABLE for any reason, * clear the fast string and enhanced fast string CPU capabilities. diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 7ba7f3d7f477..8e13b8cc6bed 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -29,8 +29,6 @@ obj-$(CONFIG_X86_PTDUMP) += debug_pagetables.o obj-$(CONFIG_HIGHMEM) += highmem_32.o -obj-$(CONFIG_KMEMCHECK) += kmemcheck/ - KASAN_SANITIZE_kasan_init_$(BITS).o := n obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index ef94620ceb8a..6fdf91ef130a 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -163,12 +163,11 @@ static int page_size_mask; static void __init probe_page_size_mask(void) { /* - * For CONFIG_KMEMCHECK or pagealloc debugging, identity mapping will - * use small pages. + * For pagealloc debugging, identity mapping will use small pages. * This will simplify cpa(), which otherwise needs to support splitting * large pages into small in interrupt context, etc. */ - if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled() && !IS_ENABLED(CONFIG_KMEMCHECK)) + if (boot_cpu_has(X86_FEATURE_PSE) && !debug_pagealloc_enabled()) page_size_mask |= 1 << PG_LEVEL_2M; else direct_gbpages = 0; diff --git a/arch/x86/mm/kmemcheck/Makefile b/arch/x86/mm/kmemcheck/Makefile deleted file mode 100644 index 520b3bce4095..000000000000 --- a/arch/x86/mm/kmemcheck/Makefile +++ /dev/null @@ -1 +0,0 @@ -obj-y := error.o kmemcheck.o opcode.o pte.o selftest.o shadow.o diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c index 872ec4159a68..cec594032515 100644 --- a/arch/x86/mm/kmemcheck/error.c +++ b/arch/x86/mm/kmemcheck/error.c @@ -1,228 +1 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include - -#include "error.h" -#include "shadow.h" - -enum kmemcheck_error_type { - KMEMCHECK_ERROR_INVALID_ACCESS, - KMEMCHECK_ERROR_BUG, -}; - -#define SHADOW_COPY_SIZE (1 << CONFIG_KMEMCHECK_SHADOW_COPY_SHIFT) - -struct kmemcheck_error { - enum kmemcheck_error_type type; - - union { - /* KMEMCHECK_ERROR_INVALID_ACCESS */ - struct { - /* Kind of access that caused the error */ - enum kmemcheck_shadow state; - /* Address and size of the erroneous read */ - unsigned long address; - unsigned int size; - }; - }; - - struct pt_regs regs; - struct stack_trace trace; - unsigned long trace_entries[32]; - - /* We compress it to a char. */ - unsigned char shadow_copy[SHADOW_COPY_SIZE]; - unsigned char memory_copy[SHADOW_COPY_SIZE]; -}; - -/* - * Create a ring queue of errors to output. We can't call printk() directly - * from the kmemcheck traps, since this may call the console drivers and - * result in a recursive fault. - */ -static struct kmemcheck_error error_fifo[CONFIG_KMEMCHECK_QUEUE_SIZE]; -static unsigned int error_count; -static unsigned int error_rd; -static unsigned int error_wr; -static unsigned int error_missed_count; - -static struct kmemcheck_error *error_next_wr(void) -{ - struct kmemcheck_error *e; - - if (error_count == ARRAY_SIZE(error_fifo)) { - ++error_missed_count; - return NULL; - } - - e = &error_fifo[error_wr]; - if (++error_wr == ARRAY_SIZE(error_fifo)) - error_wr = 0; - ++error_count; - return e; -} - -static struct kmemcheck_error *error_next_rd(void) -{ - struct kmemcheck_error *e; - - if (error_count == 0) - return NULL; - - e = &error_fifo[error_rd]; - if (++error_rd == ARRAY_SIZE(error_fifo)) - error_rd = 0; - --error_count; - return e; -} - -void kmemcheck_error_recall(void) -{ - static const char *desc[] = { - [KMEMCHECK_SHADOW_UNALLOCATED] = "unallocated", - [KMEMCHECK_SHADOW_UNINITIALIZED] = "uninitialized", - [KMEMCHECK_SHADOW_INITIALIZED] = "initialized", - [KMEMCHECK_SHADOW_FREED] = "freed", - }; - - static const char short_desc[] = { - [KMEMCHECK_SHADOW_UNALLOCATED] = 'a', - [KMEMCHECK_SHADOW_UNINITIALIZED] = 'u', - [KMEMCHECK_SHADOW_INITIALIZED] = 'i', - [KMEMCHECK_SHADOW_FREED] = 'f', - }; - - struct kmemcheck_error *e; - unsigned int i; - - e = error_next_rd(); - if (!e) - return; - - switch (e->type) { - case KMEMCHECK_ERROR_INVALID_ACCESS: - printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n", - 8 * e->size, e->state < ARRAY_SIZE(desc) ? - desc[e->state] : "(invalid shadow state)", - (void *) e->address); - - printk(KERN_WARNING); - for (i = 0; i < SHADOW_COPY_SIZE; ++i) - printk(KERN_CONT "%02x", e->memory_copy[i]); - printk(KERN_CONT "\n"); - - printk(KERN_WARNING); - for (i = 0; i < SHADOW_COPY_SIZE; ++i) { - if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) - printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]); - else - printk(KERN_CONT " ?"); - } - printk(KERN_CONT "\n"); - printk(KERN_WARNING "%*c\n", 2 + 2 - * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); - break; - case KMEMCHECK_ERROR_BUG: - printk(KERN_EMERG "ERROR: kmemcheck: Fatal error\n"); - break; - } - - __show_regs(&e->regs, 1); - print_stack_trace(&e->trace, 0); -} - -static void do_wakeup(unsigned long data) -{ - while (error_count > 0) - kmemcheck_error_recall(); - - if (error_missed_count > 0) { - printk(KERN_WARNING "kmemcheck: Lost %d error reports because " - "the queue was too small\n", error_missed_count); - error_missed_count = 0; - } -} - -static DECLARE_TASKLET(kmemcheck_tasklet, &do_wakeup, 0); - -/* - * Save the context of an error report. - */ -void kmemcheck_error_save(enum kmemcheck_shadow state, - unsigned long address, unsigned int size, struct pt_regs *regs) -{ - static unsigned long prev_ip; - - struct kmemcheck_error *e; - void *shadow_copy; - void *memory_copy; - - /* Don't report several adjacent errors from the same EIP. */ - if (regs->ip == prev_ip) - return; - prev_ip = regs->ip; - - e = error_next_wr(); - if (!e) - return; - - e->type = KMEMCHECK_ERROR_INVALID_ACCESS; - - e->state = state; - e->address = address; - e->size = size; - - /* Save regs */ - memcpy(&e->regs, regs, sizeof(*regs)); - - /* Save stack trace */ - e->trace.nr_entries = 0; - e->trace.entries = e->trace_entries; - e->trace.max_entries = ARRAY_SIZE(e->trace_entries); - e->trace.skip = 0; - save_stack_trace_regs(regs, &e->trace); - - /* Round address down to nearest 16 bytes */ - shadow_copy = kmemcheck_shadow_lookup(address - & ~(SHADOW_COPY_SIZE - 1)); - BUG_ON(!shadow_copy); - - memcpy(e->shadow_copy, shadow_copy, SHADOW_COPY_SIZE); - - kmemcheck_show_addr(address); - memory_copy = (void *) (address & ~(SHADOW_COPY_SIZE - 1)); - memcpy(e->memory_copy, memory_copy, SHADOW_COPY_SIZE); - kmemcheck_hide_addr(address); - - tasklet_hi_schedule_first(&kmemcheck_tasklet); -} - -/* - * Save the context of a kmemcheck bug. - */ -void kmemcheck_error_save_bug(struct pt_regs *regs) -{ - struct kmemcheck_error *e; - - e = error_next_wr(); - if (!e) - return; - - e->type = KMEMCHECK_ERROR_BUG; - - memcpy(&e->regs, regs, sizeof(*regs)); - - e->trace.nr_entries = 0; - e->trace.entries = e->trace_entries; - e->trace.max_entries = ARRAY_SIZE(e->trace_entries); - e->trace.skip = 1; - save_stack_trace(&e->trace); - - tasklet_hi_schedule_first(&kmemcheck_tasklet); -} diff --git a/arch/x86/mm/kmemcheck/error.h b/arch/x86/mm/kmemcheck/error.h index 39f80d7a874d..ea32a7d3cf1b 100644 --- a/arch/x86/mm/kmemcheck/error.h +++ b/arch/x86/mm/kmemcheck/error.h @@ -1,16 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__ERROR_H -#define ARCH__X86__MM__KMEMCHECK__ERROR_H - -#include - -#include "shadow.h" - -void kmemcheck_error_save(enum kmemcheck_shadow state, - unsigned long address, unsigned int size, struct pt_regs *regs); - -void kmemcheck_error_save_bug(struct pt_regs *regs); - -void kmemcheck_error_recall(void); - -#endif diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c deleted file mode 100644 index 4515bae36bbe..000000000000 --- a/arch/x86/mm/kmemcheck/kmemcheck.c +++ /dev/null @@ -1,658 +0,0 @@ -/** - * kmemcheck - a heavyweight memory checker for the linux kernel - * Copyright (C) 2007, 2008 Vegard Nossum - * (With a lot of help from Ingo Molnar and Pekka Enberg.) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License (version 2) as - * published by the Free Software Foundation. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "error.h" -#include "opcode.h" -#include "pte.h" -#include "selftest.h" -#include "shadow.h" - - -#ifdef CONFIG_KMEMCHECK_DISABLED_BY_DEFAULT -# define KMEMCHECK_ENABLED 0 -#endif - -#ifdef CONFIG_KMEMCHECK_ENABLED_BY_DEFAULT -# define KMEMCHECK_ENABLED 1 -#endif - -#ifdef CONFIG_KMEMCHECK_ONESHOT_BY_DEFAULT -# define KMEMCHECK_ENABLED 2 -#endif - -int kmemcheck_enabled = KMEMCHECK_ENABLED; - -int __init kmemcheck_init(void) -{ -#ifdef CONFIG_SMP - /* - * Limit SMP to use a single CPU. We rely on the fact that this code - * runs before SMP is set up. - */ - if (setup_max_cpus > 1) { - printk(KERN_INFO - "kmemcheck: Limiting number of CPUs to 1.\n"); - setup_max_cpus = 1; - } -#endif - - if (!kmemcheck_selftest()) { - printk(KERN_INFO "kmemcheck: self-tests failed; disabling\n"); - kmemcheck_enabled = 0; - return -EINVAL; - } - - printk(KERN_INFO "kmemcheck: Initialized\n"); - return 0; -} - -early_initcall(kmemcheck_init); - -/* - * We need to parse the kmemcheck= option before any memory is allocated. - */ -static int __init param_kmemcheck(char *str) -{ - int val; - int ret; - - if (!str) - return -EINVAL; - - ret = kstrtoint(str, 0, &val); - if (ret) - return ret; - kmemcheck_enabled = val; - return 0; -} - -early_param("kmemcheck", param_kmemcheck); - -int kmemcheck_show_addr(unsigned long address) -{ - pte_t *pte; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return 0; - - set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); - __flush_tlb_one(address); - return 1; -} - -int kmemcheck_hide_addr(unsigned long address) -{ - pte_t *pte; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return 0; - - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); - __flush_tlb_one(address); - return 1; -} - -struct kmemcheck_context { - bool busy; - int balance; - - /* - * There can be at most two memory operands to an instruction, but - * each address can cross a page boundary -- so we may need up to - * four addresses that must be hidden/revealed for each fault. - */ - unsigned long addr[4]; - unsigned long n_addrs; - unsigned long flags; - - /* Data size of the instruction that caused a fault. */ - unsigned int size; -}; - -static DEFINE_PER_CPU(struct kmemcheck_context, kmemcheck_context); - -bool kmemcheck_active(struct pt_regs *regs) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - return data->balance > 0; -} - -/* Save an address that needs to be shown/hidden */ -static void kmemcheck_save_addr(unsigned long addr) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - BUG_ON(data->n_addrs >= ARRAY_SIZE(data->addr)); - data->addr[data->n_addrs++] = addr; -} - -static unsigned int kmemcheck_show_all(void) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - unsigned int i; - unsigned int n; - - n = 0; - for (i = 0; i < data->n_addrs; ++i) - n += kmemcheck_show_addr(data->addr[i]); - - return n; -} - -static unsigned int kmemcheck_hide_all(void) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - unsigned int i; - unsigned int n; - - n = 0; - for (i = 0; i < data->n_addrs; ++i) - n += kmemcheck_hide_addr(data->addr[i]); - - return n; -} - -/* - * Called from the #PF handler. - */ -void kmemcheck_show(struct pt_regs *regs) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - BUG_ON(!irqs_disabled()); - - if (unlikely(data->balance != 0)) { - kmemcheck_show_all(); - kmemcheck_error_save_bug(regs); - data->balance = 0; - return; - } - - /* - * None of the addresses actually belonged to kmemcheck. Note that - * this is not an error. - */ - if (kmemcheck_show_all() == 0) - return; - - ++data->balance; - - /* - * The IF needs to be cleared as well, so that the faulting - * instruction can run "uninterrupted". Otherwise, we might take - * an interrupt and start executing that before we've had a chance - * to hide the page again. - * - * NOTE: In the rare case of multiple faults, we must not override - * the original flags: - */ - if (!(regs->flags & X86_EFLAGS_TF)) - data->flags = regs->flags; - - regs->flags |= X86_EFLAGS_TF; - regs->flags &= ~X86_EFLAGS_IF; -} - -/* - * Called from the #DB handler. - */ -void kmemcheck_hide(struct pt_regs *regs) -{ - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - int n; - - BUG_ON(!irqs_disabled()); - - if (unlikely(data->balance != 1)) { - kmemcheck_show_all(); - kmemcheck_error_save_bug(regs); - data->n_addrs = 0; - data->balance = 0; - - if (!(data->flags & X86_EFLAGS_TF)) - regs->flags &= ~X86_EFLAGS_TF; - if (data->flags & X86_EFLAGS_IF) - regs->flags |= X86_EFLAGS_IF; - return; - } - - if (kmemcheck_enabled) - n = kmemcheck_hide_all(); - else - n = kmemcheck_show_all(); - - if (n == 0) - return; - - --data->balance; - - data->n_addrs = 0; - - if (!(data->flags & X86_EFLAGS_TF)) - regs->flags &= ~X86_EFLAGS_TF; - if (data->flags & X86_EFLAGS_IF) - regs->flags |= X86_EFLAGS_IF; -} - -void kmemcheck_show_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) { - unsigned long address; - pte_t *pte; - unsigned int level; - - address = (unsigned long) page_address(&p[i]); - pte = lookup_address(address, &level); - BUG_ON(!pte); - BUG_ON(level != PG_LEVEL_4K); - - set_pte(pte, __pte(pte_val(*pte) | _PAGE_PRESENT)); - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_HIDDEN)); - __flush_tlb_one(address); - } -} - -bool kmemcheck_page_is_tracked(struct page *p) -{ - /* This will also check the "hidden" flag of the PTE. */ - return kmemcheck_pte_lookup((unsigned long) page_address(p)); -} - -void kmemcheck_hide_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) { - unsigned long address; - pte_t *pte; - unsigned int level; - - address = (unsigned long) page_address(&p[i]); - pte = lookup_address(address, &level); - BUG_ON(!pte); - BUG_ON(level != PG_LEVEL_4K); - - set_pte(pte, __pte(pte_val(*pte) & ~_PAGE_PRESENT)); - set_pte(pte, __pte(pte_val(*pte) | _PAGE_HIDDEN)); - __flush_tlb_one(address); - } -} - -/* Access may NOT cross page boundary */ -static void kmemcheck_read_strict(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - void *shadow; - enum kmemcheck_shadow status; - - shadow = kmemcheck_shadow_lookup(addr); - if (!shadow) - return; - - kmemcheck_save_addr(addr); - status = kmemcheck_shadow_test(shadow, size); - if (status == KMEMCHECK_SHADOW_INITIALIZED) - return; - - if (kmemcheck_enabled) - kmemcheck_error_save(status, addr, size, regs); - - if (kmemcheck_enabled == 2) - kmemcheck_enabled = 0; - - /* Don't warn about it again. */ - kmemcheck_shadow_set(shadow, size); -} - -bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) -{ - enum kmemcheck_shadow status; - void *shadow; - - shadow = kmemcheck_shadow_lookup(addr); - if (!shadow) - return true; - - status = kmemcheck_shadow_test_all(shadow, size); - - return status == KMEMCHECK_SHADOW_INITIALIZED; -} - -/* Access may cross page boundary */ -static void kmemcheck_read(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - unsigned long page = addr & PAGE_MASK; - unsigned long next_addr = addr + size - 1; - unsigned long next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - kmemcheck_read_strict(regs, addr, size); - return; - } - - /* - * What we do is basically to split the access across the - * two pages and handle each part separately. Yes, this means - * that we may now see reads that are 3 + 5 bytes, for - * example (and if both are uninitialized, there will be two - * reports), but it makes the code a lot simpler. - */ - kmemcheck_read_strict(regs, addr, next_page - addr); - kmemcheck_read_strict(regs, next_page, next_addr - next_page); -} - -static void kmemcheck_write_strict(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - void *shadow; - - shadow = kmemcheck_shadow_lookup(addr); - if (!shadow) - return; - - kmemcheck_save_addr(addr); - kmemcheck_shadow_set(shadow, size); -} - -static void kmemcheck_write(struct pt_regs *regs, - unsigned long addr, unsigned int size) -{ - unsigned long page = addr & PAGE_MASK; - unsigned long next_addr = addr + size - 1; - unsigned long next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - kmemcheck_write_strict(regs, addr, size); - return; - } - - /* See comment in kmemcheck_read(). */ - kmemcheck_write_strict(regs, addr, next_page - addr); - kmemcheck_write_strict(regs, next_page, next_addr - next_page); -} - -/* - * Copying is hard. We have two addresses, each of which may be split across - * a page (and each page will have different shadow addresses). - */ -static void kmemcheck_copy(struct pt_regs *regs, - unsigned long src_addr, unsigned long dst_addr, unsigned int size) -{ - uint8_t shadow[8]; - enum kmemcheck_shadow status; - - unsigned long page; - unsigned long next_addr; - unsigned long next_page; - - uint8_t *x; - unsigned int i; - unsigned int n; - - BUG_ON(size > sizeof(shadow)); - - page = src_addr & PAGE_MASK; - next_addr = src_addr + size - 1; - next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - /* Same page */ - x = kmemcheck_shadow_lookup(src_addr); - if (x) { - kmemcheck_save_addr(src_addr); - for (i = 0; i < size; ++i) - shadow[i] = x[i]; - } else { - for (i = 0; i < size; ++i) - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } else { - n = next_page - src_addr; - BUG_ON(n > sizeof(shadow)); - - /* First page */ - x = kmemcheck_shadow_lookup(src_addr); - if (x) { - kmemcheck_save_addr(src_addr); - for (i = 0; i < n; ++i) - shadow[i] = x[i]; - } else { - /* Not tracked */ - for (i = 0; i < n; ++i) - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - - /* Second page */ - x = kmemcheck_shadow_lookup(next_page); - if (x) { - kmemcheck_save_addr(next_page); - for (i = n; i < size; ++i) - shadow[i] = x[i - n]; - } else { - /* Not tracked */ - for (i = n; i < size; ++i) - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - - page = dst_addr & PAGE_MASK; - next_addr = dst_addr + size - 1; - next_page = next_addr & PAGE_MASK; - - if (likely(page == next_page)) { - /* Same page */ - x = kmemcheck_shadow_lookup(dst_addr); - if (x) { - kmemcheck_save_addr(dst_addr); - for (i = 0; i < size; ++i) { - x[i] = shadow[i]; - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - } else { - n = next_page - dst_addr; - BUG_ON(n > sizeof(shadow)); - - /* First page */ - x = kmemcheck_shadow_lookup(dst_addr); - if (x) { - kmemcheck_save_addr(dst_addr); - for (i = 0; i < n; ++i) { - x[i] = shadow[i]; - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - - /* Second page */ - x = kmemcheck_shadow_lookup(next_page); - if (x) { - kmemcheck_save_addr(next_page); - for (i = n; i < size; ++i) { - x[i - n] = shadow[i]; - shadow[i] = KMEMCHECK_SHADOW_INITIALIZED; - } - } - } - - status = kmemcheck_shadow_test(shadow, size); - if (status == KMEMCHECK_SHADOW_INITIALIZED) - return; - - if (kmemcheck_enabled) - kmemcheck_error_save(status, src_addr, size, regs); - - if (kmemcheck_enabled == 2) - kmemcheck_enabled = 0; -} - -enum kmemcheck_method { - KMEMCHECK_READ, - KMEMCHECK_WRITE, -}; - -static void kmemcheck_access(struct pt_regs *regs, - unsigned long fallback_address, enum kmemcheck_method fallback_method) -{ - const uint8_t *insn; - const uint8_t *insn_primary; - unsigned int size; - - struct kmemcheck_context *data = this_cpu_ptr(&kmemcheck_context); - - /* Recursive fault -- ouch. */ - if (data->busy) { - kmemcheck_show_addr(fallback_address); - kmemcheck_error_save_bug(regs); - return; - } - - data->busy = true; - - insn = (const uint8_t *) regs->ip; - insn_primary = kmemcheck_opcode_get_primary(insn); - - kmemcheck_opcode_decode(insn, &size); - - switch (insn_primary[0]) { -#ifdef CONFIG_KMEMCHECK_BITOPS_OK - /* AND, OR, XOR */ - /* - * Unfortunately, these instructions have to be excluded from - * our regular checking since they access only some (and not - * all) bits. This clears out "bogus" bitfield-access warnings. - */ - case 0x80: - case 0x81: - case 0x82: - case 0x83: - switch ((insn_primary[1] >> 3) & 7) { - /* OR */ - case 1: - /* AND */ - case 4: - /* XOR */ - case 6: - kmemcheck_write(regs, fallback_address, size); - goto out; - - /* ADD */ - case 0: - /* ADC */ - case 2: - /* SBB */ - case 3: - /* SUB */ - case 5: - /* CMP */ - case 7: - break; - } - break; -#endif - - /* MOVS, MOVSB, MOVSW, MOVSD */ - case 0xa4: - case 0xa5: - /* - * These instructions are special because they take two - * addresses, but we only get one page fault. - */ - kmemcheck_copy(regs, regs->si, regs->di, size); - goto out; - - /* CMPS, CMPSB, CMPSW, CMPSD */ - case 0xa6: - case 0xa7: - kmemcheck_read(regs, regs->si, size); - kmemcheck_read(regs, regs->di, size); - goto out; - } - - /* - * If the opcode isn't special in any way, we use the data from the - * page fault handler to determine the address and type of memory - * access. - */ - switch (fallback_method) { - case KMEMCHECK_READ: - kmemcheck_read(regs, fallback_address, size); - goto out; - case KMEMCHECK_WRITE: - kmemcheck_write(regs, fallback_address, size); - goto out; - } - -out: - data->busy = false; -} - -bool kmemcheck_fault(struct pt_regs *regs, unsigned long address, - unsigned long error_code) -{ - pte_t *pte; - - /* - * XXX: Is it safe to assume that memory accesses from virtual 86 - * mode or non-kernel code segments will _never_ access kernel - * memory (e.g. tracked pages)? For now, we need this to avoid - * invoking kmemcheck for PnP BIOS calls. - */ - if (regs->flags & X86_VM_MASK) - return false; - if (regs->cs != __KERNEL_CS) - return false; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return false; - - WARN_ON_ONCE(in_nmi()); - - if (error_code & 2) - kmemcheck_access(regs, address, KMEMCHECK_WRITE); - else - kmemcheck_access(regs, address, KMEMCHECK_READ); - - kmemcheck_show(regs); - return true; -} - -bool kmemcheck_trap(struct pt_regs *regs) -{ - if (!kmemcheck_active(regs)) - return false; - - /* We're done. */ - kmemcheck_hide(regs); - return true; -} diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c index df8109ddf7fe..cec594032515 100644 --- a/arch/x86/mm/kmemcheck/opcode.c +++ b/arch/x86/mm/kmemcheck/opcode.c @@ -1,107 +1 @@ // SPDX-License-Identifier: GPL-2.0 -#include - -#include "opcode.h" - -static bool opcode_is_prefix(uint8_t b) -{ - return - /* Group 1 */ - b == 0xf0 || b == 0xf2 || b == 0xf3 - /* Group 2 */ - || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 - || b == 0x64 || b == 0x65 - /* Group 3 */ - || b == 0x66 - /* Group 4 */ - || b == 0x67; -} - -#ifdef CONFIG_X86_64 -static bool opcode_is_rex_prefix(uint8_t b) -{ - return (b & 0xf0) == 0x40; -} -#else -static bool opcode_is_rex_prefix(uint8_t b) -{ - return false; -} -#endif - -#define REX_W (1 << 3) - -/* - * This is a VERY crude opcode decoder. We only need to find the size of the - * load/store that caused our #PF and this should work for all the opcodes - * that we care about. Moreover, the ones who invented this instruction set - * should be shot. - */ -void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size) -{ - /* Default operand size */ - int operand_size_override = 4; - - /* prefixes */ - for (; opcode_is_prefix(*op); ++op) { - if (*op == 0x66) - operand_size_override = 2; - } - - /* REX prefix */ - if (opcode_is_rex_prefix(*op)) { - uint8_t rex = *op; - - ++op; - if (rex & REX_W) { - switch (*op) { - case 0x63: - *size = 4; - return; - case 0x0f: - ++op; - - switch (*op) { - case 0xb6: - case 0xbe: - *size = 1; - return; - case 0xb7: - case 0xbf: - *size = 2; - return; - } - - break; - } - - *size = 8; - return; - } - } - - /* escape opcode */ - if (*op == 0x0f) { - ++op; - - /* - * This is move with zero-extend and sign-extend, respectively; - * we don't have to think about 0xb6/0xbe, because this is - * already handled in the conditional below. - */ - if (*op == 0xb7 || *op == 0xbf) - operand_size_override = 2; - } - - *size = (*op & 1) ? operand_size_override : 1; -} - -const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op) -{ - /* skip prefixes */ - while (opcode_is_prefix(*op)) - ++op; - if (opcode_is_rex_prefix(*op)) - ++op; - return op; -} diff --git a/arch/x86/mm/kmemcheck/opcode.h b/arch/x86/mm/kmemcheck/opcode.h index 51a1ce94c24a..ea32a7d3cf1b 100644 --- a/arch/x86/mm/kmemcheck/opcode.h +++ b/arch/x86/mm/kmemcheck/opcode.h @@ -1,10 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__OPCODE_H -#define ARCH__X86__MM__KMEMCHECK__OPCODE_H - -#include - -void kmemcheck_opcode_decode(const uint8_t *op, unsigned int *size); -const uint8_t *kmemcheck_opcode_get_primary(const uint8_t *op); - -#endif diff --git a/arch/x86/mm/kmemcheck/pte.c b/arch/x86/mm/kmemcheck/pte.c index 8a03be90272a..cec594032515 100644 --- a/arch/x86/mm/kmemcheck/pte.c +++ b/arch/x86/mm/kmemcheck/pte.c @@ -1,23 +1 @@ // SPDX-License-Identifier: GPL-2.0 -#include - -#include - -#include "pte.h" - -pte_t *kmemcheck_pte_lookup(unsigned long address) -{ - pte_t *pte; - unsigned int level; - - pte = lookup_address(address, &level); - if (!pte) - return NULL; - if (level != PG_LEVEL_4K) - return NULL; - if (!pte_hidden(*pte)) - return NULL; - - return pte; -} - diff --git a/arch/x86/mm/kmemcheck/pte.h b/arch/x86/mm/kmemcheck/pte.h index b595612382c2..ea32a7d3cf1b 100644 --- a/arch/x86/mm/kmemcheck/pte.h +++ b/arch/x86/mm/kmemcheck/pte.h @@ -1,11 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__PTE_H -#define ARCH__X86__MM__KMEMCHECK__PTE_H - -#include - -#include - -pte_t *kmemcheck_pte_lookup(unsigned long address); - -#endif diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c index 7ce0be1f99eb..cec594032515 100644 --- a/arch/x86/mm/kmemcheck/selftest.c +++ b/arch/x86/mm/kmemcheck/selftest.c @@ -1,71 +1 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include - -#include "opcode.h" -#include "selftest.h" - -struct selftest_opcode { - unsigned int expected_size; - const uint8_t *insn; - const char *desc; -}; - -static const struct selftest_opcode selftest_opcodes[] = { - /* REP MOVS */ - {1, "\xf3\xa4", "rep movsb , "}, - {4, "\xf3\xa5", "rep movsl , "}, - - /* MOVZX / MOVZXD */ - {1, "\x66\x0f\xb6\x51\xf8", "movzwq , "}, - {1, "\x0f\xb6\x51\xf8", "movzwq , "}, - - /* MOVSX / MOVSXD */ - {1, "\x66\x0f\xbe\x51\xf8", "movswq , "}, - {1, "\x0f\xbe\x51\xf8", "movswq , "}, - -#ifdef CONFIG_X86_64 - /* MOVZX / MOVZXD */ - {1, "\x49\x0f\xb6\x51\xf8", "movzbq , "}, - {2, "\x49\x0f\xb7\x51\xf8", "movzbq , "}, - - /* MOVSX / MOVSXD */ - {1, "\x49\x0f\xbe\x51\xf8", "movsbq , "}, - {2, "\x49\x0f\xbf\x51\xf8", "movsbq , "}, - {4, "\x49\x63\x51\xf8", "movslq , "}, -#endif -}; - -static bool selftest_opcode_one(const struct selftest_opcode *op) -{ - unsigned size; - - kmemcheck_opcode_decode(op->insn, &size); - - if (size == op->expected_size) - return true; - - printk(KERN_WARNING "kmemcheck: opcode %s: expected size %d, got %d\n", - op->desc, op->expected_size, size); - return false; -} - -static bool selftest_opcodes_all(void) -{ - bool pass = true; - unsigned int i; - - for (i = 0; i < ARRAY_SIZE(selftest_opcodes); ++i) - pass = pass && selftest_opcode_one(&selftest_opcodes[i]); - - return pass; -} - -bool kmemcheck_selftest(void) -{ - bool pass = true; - - pass = pass && selftest_opcodes_all(); - - return pass; -} diff --git a/arch/x86/mm/kmemcheck/selftest.h b/arch/x86/mm/kmemcheck/selftest.h index 8d759aae453d..ea32a7d3cf1b 100644 --- a/arch/x86/mm/kmemcheck/selftest.h +++ b/arch/x86/mm/kmemcheck/selftest.h @@ -1,7 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH_X86_MM_KMEMCHECK_SELFTEST_H -#define ARCH_X86_MM_KMEMCHECK_SELFTEST_H - -bool kmemcheck_selftest(void); - -#endif diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c deleted file mode 100644 index c2638a7d2c10..000000000000 --- a/arch/x86/mm/kmemcheck/shadow.c +++ /dev/null @@ -1,173 +0,0 @@ -#include -#include -#include - -#include -#include - -#include "pte.h" -#include "shadow.h" - -/* - * Return the shadow address for the given address. Returns NULL if the - * address is not tracked. - * - * We need to be extremely careful not to follow any invalid pointers, - * because this function can be called for *any* possible address. - */ -void *kmemcheck_shadow_lookup(unsigned long address) -{ - pte_t *pte; - struct page *page; - - if (!virt_addr_valid(address)) - return NULL; - - pte = kmemcheck_pte_lookup(address); - if (!pte) - return NULL; - - page = virt_to_page(address); - if (!page->shadow) - return NULL; - return page->shadow + (address & (PAGE_SIZE - 1)); -} - -static void mark_shadow(void *address, unsigned int n, - enum kmemcheck_shadow status) -{ - unsigned long addr = (unsigned long) address; - unsigned long last_addr = addr + n - 1; - unsigned long page = addr & PAGE_MASK; - unsigned long last_page = last_addr & PAGE_MASK; - unsigned int first_n; - void *shadow; - - /* If the memory range crosses a page boundary, stop there. */ - if (page == last_page) - first_n = n; - else - first_n = page + PAGE_SIZE - addr; - - shadow = kmemcheck_shadow_lookup(addr); - if (shadow) - memset(shadow, status, first_n); - - addr += first_n; - n -= first_n; - - /* Do full-page memset()s. */ - while (n >= PAGE_SIZE) { - shadow = kmemcheck_shadow_lookup(addr); - if (shadow) - memset(shadow, status, PAGE_SIZE); - - addr += PAGE_SIZE; - n -= PAGE_SIZE; - } - - /* Do the remaining page, if any. */ - if (n > 0) { - shadow = kmemcheck_shadow_lookup(addr); - if (shadow) - memset(shadow, status, n); - } -} - -void kmemcheck_mark_unallocated(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_UNALLOCATED); -} - -void kmemcheck_mark_uninitialized(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_UNINITIALIZED); -} - -/* - * Fill the shadow memory of the given address such that the memory at that - * address is marked as being initialized. - */ -void kmemcheck_mark_initialized(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_INITIALIZED); -} -EXPORT_SYMBOL_GPL(kmemcheck_mark_initialized); - -void kmemcheck_mark_freed(void *address, unsigned int n) -{ - mark_shadow(address, n, KMEMCHECK_SHADOW_FREED); -} - -void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) - kmemcheck_mark_unallocated(page_address(&p[i]), PAGE_SIZE); -} - -void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) - kmemcheck_mark_uninitialized(page_address(&p[i]), PAGE_SIZE); -} - -void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n) -{ - unsigned int i; - - for (i = 0; i < n; ++i) - kmemcheck_mark_initialized(page_address(&p[i]), PAGE_SIZE); -} - -enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) -{ -#ifdef CONFIG_KMEMCHECK_PARTIAL_OK - uint8_t *x; - unsigned int i; - - x = shadow; - - /* - * Make sure _some_ bytes are initialized. Gcc frequently generates - * code to access neighboring bytes. - */ - for (i = 0; i < size; ++i) { - if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) - return x[i]; - } - - return x[0]; -#else - return kmemcheck_shadow_test_all(shadow, size); -#endif -} - -enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size) -{ - uint8_t *x; - unsigned int i; - - x = shadow; - - /* All bytes must be initialized. */ - for (i = 0; i < size; ++i) { - if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) - return x[i]; - } - - return x[0]; -} - -void kmemcheck_shadow_set(void *shadow, unsigned int size) -{ - uint8_t *x; - unsigned int i; - - x = shadow; - for (i = 0; i < size; ++i) - x[i] = KMEMCHECK_SHADOW_INITIALIZED; -} diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h index 49768dc18664..ea32a7d3cf1b 100644 --- a/arch/x86/mm/kmemcheck/shadow.h +++ b/arch/x86/mm/kmemcheck/shadow.h @@ -1,19 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef ARCH__X86__MM__KMEMCHECK__SHADOW_H -#define ARCH__X86__MM__KMEMCHECK__SHADOW_H - -enum kmemcheck_shadow { - KMEMCHECK_SHADOW_UNALLOCATED, - KMEMCHECK_SHADOW_UNINITIALIZED, - KMEMCHECK_SHADOW_INITIALIZED, - KMEMCHECK_SHADOW_FREED, -}; - -void *kmemcheck_shadow_lookup(unsigned long address); - -enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); -enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, - unsigned int size); -void kmemcheck_shadow_set(void *shadow, unsigned int size); - -#endif diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index baeb872283d9..69c238210325 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -594,21 +594,6 @@ static inline void tasklet_hi_schedule(struct tasklet_struct *t) __tasklet_hi_schedule(t); } -extern void __tasklet_hi_schedule_first(struct tasklet_struct *t); - -/* - * This version avoids touching any other tasklets. Needed for kmemcheck - * in order not to take any page faults while enqueueing this tasklet; - * consider VERY carefully whether you really need this or - * tasklet_hi_schedule()... - */ -static inline void tasklet_hi_schedule_first(struct tasklet_struct *t) -{ - if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) - __tasklet_hi_schedule_first(t); -} - - static inline void tasklet_disable_nosync(struct tasklet_struct *t) { atomic_inc(&t->count); diff --git a/include/linux/kmemcheck.h b/include/linux/kmemcheck.h index 7b1d7bead7d9..ea32a7d3cf1b 100644 --- a/include/linux/kmemcheck.h +++ b/include/linux/kmemcheck.h @@ -1,172 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef LINUX_KMEMCHECK_H -#define LINUX_KMEMCHECK_H - -#include -#include - -#ifdef CONFIG_KMEMCHECK -extern int kmemcheck_enabled; - -/* The slab-related functions. */ -void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node); -void kmemcheck_free_shadow(struct page *page, int order); -void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, - size_t size); -void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size); - -void kmemcheck_pagealloc_alloc(struct page *p, unsigned int order, - gfp_t gfpflags); - -void kmemcheck_show_pages(struct page *p, unsigned int n); -void kmemcheck_hide_pages(struct page *p, unsigned int n); - -bool kmemcheck_page_is_tracked(struct page *p); - -void kmemcheck_mark_unallocated(void *address, unsigned int n); -void kmemcheck_mark_uninitialized(void *address, unsigned int n); -void kmemcheck_mark_initialized(void *address, unsigned int n); -void kmemcheck_mark_freed(void *address, unsigned int n); - -void kmemcheck_mark_unallocated_pages(struct page *p, unsigned int n); -void kmemcheck_mark_uninitialized_pages(struct page *p, unsigned int n); -void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n); - -int kmemcheck_show_addr(unsigned long address); -int kmemcheck_hide_addr(unsigned long address); - -bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size); - -/* - * Bitfield annotations - * - * How to use: If you have a struct using bitfields, for example - * - * struct a { - * int x:8, y:8; - * }; - * - * then this should be rewritten as - * - * struct a { - * kmemcheck_bitfield_begin(flags); - * int x:8, y:8; - * kmemcheck_bitfield_end(flags); - * }; - * - * Now the "flags_begin" and "flags_end" members may be used to refer to the - * beginning and end, respectively, of the bitfield (and things like - * &x.flags_begin is allowed). As soon as the struct is allocated, the bit- - * fields should be annotated: - * - * struct a *a = kmalloc(sizeof(struct a), GFP_KERNEL); - * kmemcheck_annotate_bitfield(a, flags); - */ -#define kmemcheck_bitfield_begin(name) \ - int name##_begin[0]; - -#define kmemcheck_bitfield_end(name) \ - int name##_end[0]; - -#define kmemcheck_annotate_bitfield(ptr, name) \ - do { \ - int _n; \ - \ - if (!ptr) \ - break; \ - \ - _n = (long) &((ptr)->name##_end) \ - - (long) &((ptr)->name##_begin); \ - BUILD_BUG_ON(_n < 0); \ - \ - kmemcheck_mark_initialized(&((ptr)->name##_begin), _n); \ - } while (0) - -#define kmemcheck_annotate_variable(var) \ - do { \ - kmemcheck_mark_initialized(&(var), sizeof(var)); \ - } while (0) \ - -#else -#define kmemcheck_enabled 0 - -static inline void -kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) -{ -} - -static inline void -kmemcheck_free_shadow(struct page *page, int order) -{ -} - -static inline void -kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, - size_t size) -{ -} - -static inline void kmemcheck_slab_free(struct kmem_cache *s, void *object, - size_t size) -{ -} - -static inline void kmemcheck_pagealloc_alloc(struct page *p, - unsigned int order, gfp_t gfpflags) -{ -} - -static inline bool kmemcheck_page_is_tracked(struct page *p) -{ - return false; -} - -static inline void kmemcheck_mark_unallocated(void *address, unsigned int n) -{ -} - -static inline void kmemcheck_mark_uninitialized(void *address, unsigned int n) -{ -} - -static inline void kmemcheck_mark_initialized(void *address, unsigned int n) -{ -} - -static inline void kmemcheck_mark_freed(void *address, unsigned int n) -{ -} - -static inline void kmemcheck_mark_unallocated_pages(struct page *p, - unsigned int n) -{ -} - -static inline void kmemcheck_mark_uninitialized_pages(struct page *p, - unsigned int n) -{ -} - -static inline void kmemcheck_mark_initialized_pages(struct page *p, - unsigned int n) -{ -} - -static inline bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size) -{ - return true; -} - -#define kmemcheck_bitfield_begin(name) -#define kmemcheck_bitfield_end(name) -#define kmemcheck_annotate_bitfield(ptr, name) \ - do { \ - } while (0) - -#define kmemcheck_annotate_variable(var) \ - do { \ - } while (0) - -#endif /* CONFIG_KMEMCHECK */ - -#endif /* LINUX_KMEMCHECK_H */ diff --git a/kernel/softirq.c b/kernel/softirq.c index 662f7b1b7a78..2f5e87f1bae2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -486,16 +486,6 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) } EXPORT_SYMBOL(__tasklet_hi_schedule); -void __tasklet_hi_schedule_first(struct tasklet_struct *t) -{ - lockdep_assert_irqs_disabled(); - - t->next = __this_cpu_read(tasklet_hi_vec.head); - __this_cpu_write(tasklet_hi_vec.head, t); - __raise_softirq_irqoff(HI_SOFTIRQ); -} -EXPORT_SYMBOL(__tasklet_hi_schedule_first); - static __latent_entropy void tasklet_action(struct softirq_action *a) { struct tasklet_struct *list; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9576bd582d4a..7638e2f7fff8 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -30,7 +30,6 @@ #include #include #include -#include #include #include #include @@ -1173,15 +1172,6 @@ static struct ctl_table kern_table[] = { .extra1 = &zero, .extra2 = &one_thousand, }, -#endif -#ifdef CONFIG_KMEMCHECK - { - .procname = "kmemcheck", - .data = &kmemcheck_enabled, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #endif { .procname = "panic_on_warn", diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 07ce7449765a..5402e3954659 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -504,7 +504,7 @@ config DEBUG_OBJECTS_ENABLE_DEFAULT config DEBUG_SLAB bool "Debug slab memory allocations" - depends on DEBUG_KERNEL && SLAB && !KMEMCHECK + depends on DEBUG_KERNEL && SLAB help Say Y here to have the kernel do limited verification on memory allocation as well as poisoning memory on free to catch use of freed @@ -516,7 +516,7 @@ config DEBUG_SLAB_LEAK config SLUB_DEBUG_ON bool "SLUB debugging on by default" - depends on SLUB && SLUB_DEBUG && !KMEMCHECK + depends on SLUB && SLUB_DEBUG default n help Boot with debugging on by default. SLUB boots by default with @@ -730,8 +730,6 @@ config DEBUG_STACKOVERFLOW If in doubt, say "N". -source "lib/Kconfig.kmemcheck" - source "lib/Kconfig.kasan" endmenu # "Memory Debugging" diff --git a/lib/Kconfig.kmemcheck b/lib/Kconfig.kmemcheck deleted file mode 100644 index 846e039a86b4..000000000000 --- a/lib/Kconfig.kmemcheck +++ /dev/null @@ -1,94 +0,0 @@ -config HAVE_ARCH_KMEMCHECK - bool - -if HAVE_ARCH_KMEMCHECK - -menuconfig KMEMCHECK - bool "kmemcheck: trap use of uninitialized memory" - depends on DEBUG_KERNEL - depends on !X86_USE_3DNOW - depends on SLUB || SLAB - depends on !CC_OPTIMIZE_FOR_SIZE - depends on !FUNCTION_TRACER - select FRAME_POINTER - select STACKTRACE - default n - help - This option enables tracing of dynamically allocated kernel memory - to see if memory is used before it has been given an initial value. - Be aware that this requires half of your memory for bookkeeping and - will insert extra code at *every* read and write to tracked memory - thus slow down the kernel code (but user code is unaffected). - - The kernel may be started with kmemcheck=0 or kmemcheck=1 to disable - or enable kmemcheck at boot-time. If the kernel is started with - kmemcheck=0, the large memory and CPU overhead is not incurred. - -choice - prompt "kmemcheck: default mode at boot" - depends on KMEMCHECK - default KMEMCHECK_ONESHOT_BY_DEFAULT - help - This option controls the default behaviour of kmemcheck when the - kernel boots and no kmemcheck= parameter is given. - -config KMEMCHECK_DISABLED_BY_DEFAULT - bool "disabled" - depends on KMEMCHECK - -config KMEMCHECK_ENABLED_BY_DEFAULT - bool "enabled" - depends on KMEMCHECK - -config KMEMCHECK_ONESHOT_BY_DEFAULT - bool "one-shot" - depends on KMEMCHECK - help - In one-shot mode, only the first error detected is reported before - kmemcheck is disabled. - -endchoice - -config KMEMCHECK_QUEUE_SIZE - int "kmemcheck: error queue size" - depends on KMEMCHECK - default 64 - help - Select the maximum number of errors to store in the queue. Since - errors can occur virtually anywhere and in any context, we need a - temporary storage area which is guarantueed not to generate any - other faults. The queue will be emptied as soon as a tasklet may - be scheduled. If the queue is full, new error reports will be - lost. - -config KMEMCHECK_SHADOW_COPY_SHIFT - int "kmemcheck: shadow copy size (5 => 32 bytes, 6 => 64 bytes)" - depends on KMEMCHECK - range 2 8 - default 5 - help - Select the number of shadow bytes to save along with each entry of - the queue. These bytes indicate what parts of an allocation are - initialized, uninitialized, etc. and will be displayed when an - error is detected to help the debugging of a particular problem. - -config KMEMCHECK_PARTIAL_OK - bool "kmemcheck: allow partially uninitialized memory" - depends on KMEMCHECK - default y - help - This option works around certain GCC optimizations that produce - 32-bit reads from 16-bit variables where the upper 16 bits are - thrown away afterwards. This may of course also hide some real - bugs. - -config KMEMCHECK_BITOPS_OK - bool "kmemcheck: allow bit-field manipulation" - depends on KMEMCHECK - default n - help - This option silences warnings that would be generated for bit-field - accesses where not all the bits are initialized at the same time. - This may also hide some real bugs. - -endif diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 5b0adf1435de..e5e606ee5f71 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -11,7 +11,6 @@ config DEBUG_PAGEALLOC bool "Debug page memory allocations" depends on DEBUG_KERNEL depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC - depends on !KMEMCHECK select PAGE_EXTENSION select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- diff --git a/mm/Makefile b/mm/Makefile index 4659b93cba43..e7ebd176fb93 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -17,7 +17,6 @@ KCOV_INSTRUMENT_slub.o := n KCOV_INSTRUMENT_page_alloc.o := n KCOV_INSTRUMENT_debug-pagealloc.o := n KCOV_INSTRUMENT_kmemleak.o := n -KCOV_INSTRUMENT_kmemcheck.o := n KCOV_INSTRUMENT_memcontrol.o := n KCOV_INSTRUMENT_mmzone.o := n KCOV_INSTRUMENT_vmstat.o := n @@ -70,7 +69,6 @@ obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += page_poison.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o -obj-$(CONFIG_KMEMCHECK) += kmemcheck.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o diff --git a/mm/kmemcheck.c b/mm/kmemcheck.c index b3a4d61d341c..cec594032515 100644 --- a/mm/kmemcheck.c +++ b/mm/kmemcheck.c @@ -1,126 +1 @@ // SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include "slab.h" -#include - -void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node) -{ - struct page *shadow; - int pages; - int i; - - pages = 1 << order; - - /* - * With kmemcheck enabled, we need to allocate a memory area for the - * shadow bits as well. - */ - shadow = alloc_pages_node(node, flags, order); - if (!shadow) { - if (printk_ratelimit()) - pr_err("kmemcheck: failed to allocate shadow bitmap\n"); - return; - } - - for(i = 0; i < pages; ++i) - page[i].shadow = page_address(&shadow[i]); - - /* - * Mark it as non-present for the MMU so that our accesses to - * this memory will trigger a page fault and let us analyze - * the memory accesses. - */ - kmemcheck_hide_pages(page, pages); -} - -void kmemcheck_free_shadow(struct page *page, int order) -{ - struct page *shadow; - int pages; - int i; - - if (!kmemcheck_page_is_tracked(page)) - return; - - pages = 1 << order; - - kmemcheck_show_pages(page, pages); - - shadow = virt_to_page(page[0].shadow); - - for(i = 0; i < pages; ++i) - page[i].shadow = NULL; - - __free_pages(shadow, order); -} - -void kmemcheck_slab_alloc(struct kmem_cache *s, gfp_t gfpflags, void *object, - size_t size) -{ - if (unlikely(!object)) /* Skip object if allocation failed */ - return; - - /* - * Has already been memset(), which initializes the shadow for us - * as well. - */ - if (gfpflags & __GFP_ZERO) - return; - - /* No need to initialize the shadow of a non-tracked slab. */ - if (s->flags & SLAB_NOTRACK) - return; - - if (!kmemcheck_enabled || gfpflags & __GFP_NOTRACK) { - /* - * Allow notracked objects to be allocated from - * tracked caches. Note however that these objects - * will still get page faults on access, they just - * won't ever be flagged as uninitialized. If page - * faults are not acceptable, the slab cache itself - * should be marked NOTRACK. - */ - kmemcheck_mark_initialized(object, size); - } else if (!s->ctor) { - /* - * New objects should be marked uninitialized before - * they're returned to the called. - */ - kmemcheck_mark_uninitialized(object, size); - } -} - -void kmemcheck_slab_free(struct kmem_cache *s, void *object, size_t size) -{ - /* TODO: RCU freeing is unsupported for now; hide false positives. */ - if (!s->ctor && !(s->flags & SLAB_TYPESAFE_BY_RCU)) - kmemcheck_mark_freed(object, size); -} - -void kmemcheck_pagealloc_alloc(struct page *page, unsigned int order, - gfp_t gfpflags) -{ - int pages; - - if (gfpflags & (__GFP_HIGHMEM | __GFP_NOTRACK)) - return; - - pages = 1 << order; - - /* - * NOTE: We choose to track GFP_ZERO pages too; in fact, they - * can become uninitialized by copying uninitialized memory - * into them. - */ - - /* XXX: Can use zone->node for node? */ - kmemcheck_alloc_shadow(page, order, gfpflags, -1); - - if (gfpflags & __GFP_ZERO) - kmemcheck_mark_initialized_pages(page, pages); - else - kmemcheck_mark_uninitialized_pages(page, pages); -} diff --git a/mm/slub.c b/mm/slub.c index c2c41e178acf..cfd56e5a35fb 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1371,7 +1371,7 @@ static inline void *slab_free_hook(struct kmem_cache *s, void *x) * So in order to make the debug calls that expect irqs to be * disabled we need to disable interrupts temporarily. */ -#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP) +#ifdef CONFIG_LOCKDEP { unsigned long flags; @@ -1399,8 +1399,7 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s, * Compiler cannot detect this function can be removed if slab_free_hook() * evaluates to nothing. Thus, catch all relevant config debug options here. */ -#if defined(CONFIG_KMEMCHECK) || \ - defined(CONFIG_LOCKDEP) || \ +#if defined(CONFIG_LOCKDEP) || \ defined(CONFIG_DEBUG_KMEMLEAK) || \ defined(CONFIG_DEBUG_OBJECTS_FREE) || \ defined(CONFIG_KASAN) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 67d051edd615..7bd52b8f63d4 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -2182,8 +2182,6 @@ sub dump_struct($$) { # strip comments: $members =~ s/\/\*.*?\*\///gos; $nested =~ s/\/\*.*?\*\///gos; - # strip kmemcheck_bitfield_{begin,end}.*; - $members =~ s/kmemcheck_bitfield_.*?;//gos; # strip attributes $members =~ s/__attribute__\s*\(\([a-z,_\*\s\(\)]*\)\)//i; $members =~ s/__aligned\s*\([^;]*\)//gos; diff --git a/tools/include/linux/kmemcheck.h b/tools/include/linux/kmemcheck.h index 2bccd2c7b897..ea32a7d3cf1b 100644 --- a/tools/include/linux/kmemcheck.h +++ b/tools/include/linux/kmemcheck.h @@ -1,9 +1 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LIBLOCKDEP_LINUX_KMEMCHECK_H_ -#define _LIBLOCKDEP_LINUX_KMEMCHECK_H_ - -static inline void kmemcheck_mark_initialized(void *address, unsigned int n) -{ -} - -#endif From 783cb68ee2d25d621326366c0b615bf2ccf3b402 Mon Sep 17 00:00:00 2001 From: Changbin Du Date: Wed, 15 Nov 2017 17:36:06 -0800 Subject: [PATCH 080/131] mm/swap_state.c: declare a few variables as __read_mostly These global variables are only set during initialization or rarely change, so declare them as __read_mostly. Link: http://lkml.kernel.org/r/1507802349-5554-1-git-send-email-changbin.du@intel.com Signed-off-by: Changbin Du Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap_state.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index f2face8b889e..374d446f7a0a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -36,9 +36,9 @@ static const struct address_space_operations swap_aops = { #endif }; -struct address_space *swapper_spaces[MAX_SWAPFILES]; -static unsigned int nr_swapper_spaces[MAX_SWAPFILES]; -bool swap_vma_readahead = true; +struct address_space *swapper_spaces[MAX_SWAPFILES] __read_mostly; +static unsigned int nr_swapper_spaces[MAX_SWAPFILES] __read_mostly; +bool swap_vma_readahead __read_mostly = true; #define SWAP_RA_WIN_SHIFT (PAGE_SHIFT / 2) #define SWAP_RA_HITS_MASK ((1UL << SWAP_RA_WIN_SHIFT) - 1) From 2f47a91f4dab19aaaa05cdcfced9dfcaf3f5257e Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 15 Nov 2017 17:36:09 -0800 Subject: [PATCH 081/131] mm: deferred_init_memmap improvements Patch series "complete deferred page initialization", v12. SMP machines can benefit from the DEFERRED_STRUCT_PAGE_INIT config option, which defers initializing struct pages until all cpus have been started so it can be done in parallel. However, this feature is sub-optimal, because the deferred page initialization code expects that the struct pages have already been zeroed, and the zeroing is done early in boot with a single thread only. Also, we access that memory and set flags before struct pages are initialized. All of this is fixed in this patchset. In this work we do the following: - Never read access struct page until it was initialized - Never set any fields in struct pages before they are initialized - Zero struct page at the beginning of struct page initialization ========================================================================== Performance improvements on x86 machine with 8 nodes: Intel(R) Xeon(R) CPU E7-8895 v3 @ 2.60GHz and 1T of memory: TIME SPEED UP base no deferred: 95.796233s fix no deferred: 79.978956s 19.77% base deferred: 77.254713s fix deferred: 55.050509s 40.34% ========================================================================== SPARC M6 3600 MHz with 15T of memory TIME SPEED UP base no deferred: 358.335727s fix no deferred: 302.320936s 18.52% base deferred: 237.534603s fix deferred: 182.103003s 30.44% ========================================================================== Raw dmesg output with timestamps: x86 base no deferred: https://hastebin.com/ofunepurit.scala x86 base deferred: https://hastebin.com/ifazegeyas.scala x86 fix no deferred: https://hastebin.com/pegocohevo.scala x86 fix deferred: https://hastebin.com/ofupevikuk.scala sparc base no deferred: https://hastebin.com/ibobeteken.go sparc base deferred: https://hastebin.com/fariqimiyu.go sparc fix no deferred: https://hastebin.com/muhegoheyi.go sparc fix deferred: https://hastebin.com/xadinobutu.go This patch (of 11): deferred_init_memmap() is called when struct pages are initialized later in boot by slave CPUs. This patch simplifies and optimizes this function, and also fixes a couple issues (described below). The main change is that now we are iterating through free memblock areas instead of all configured memory. Thus, we do not have to check if the struct page has already been initialized. ===== In deferred_init_memmap() where all deferred struct pages are initialized we have a check like this: if (page->flags) { VM_BUG_ON(page_zone(page) != zone); goto free_range; } This way we are checking if the current deferred page has already been initialized. It works, because memory for struct pages has been zeroed, and the only way flags are not zero if it went through __init_single_page() before. But, once we change the current behavior and won't zero the memory in memblock allocator, we cannot trust anything inside "struct page"es until they are initialized. This patch fixes this. The deferred_init_memmap() is re-written to loop through only free memory ranges provided by memblock. Note, this first issue is relevant only when the following change is merged: ===== This patch fixes another existing issue on systems that have holes in zones i.e CONFIG_HOLES_IN_ZONE is defined. In for_each_mem_pfn_range() we have code like this: if (!pfn_valid_within(pfn) goto free_range; Note: 'page' is not set to NULL and is not incremented but 'pfn' advances. Thus means if deferred struct pages are enabled on systems with these kind of holes, linux would get memory corruptions. I have fixed this issue by defining a new macro that performs all the necessary operations when we free the current set of pages. [pasha.tatashin@oracle.com: buddy page accessed before initialized] Link: http://lkml.kernel.org/r/20171102170221.7401-2-pasha.tatashin@oracle.com Link: http://lkml.kernel.org/r/20171013173214.27300-2-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Steven Sistare Reviewed-by: Daniel Jordan Reviewed-by: Bob Picco Tested-by: Bob Picco Acked-by: Michal Hocko Cc: Christian Borntraeger Cc: Heiko Carstens Cc: David S. Miller Cc: Matthew Wilcox Cc: Michal Hocko Cc: Ard Biesheuvel Cc: Mark Rutland Cc: Will Deacon Cc: Catalin Marinas Cc: Sam Ravnborg Cc: Mel Gorman Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 188 +++++++++++++++++++++++++++--------------------- 1 file changed, 105 insertions(+), 83 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 30a464b47366..4dee5082d3d7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1408,14 +1408,17 @@ void clear_zone_contiguous(struct zone *zone) } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT -static void __init deferred_free_range(struct page *page, - unsigned long pfn, int nr_pages) +static void __init deferred_free_range(unsigned long pfn, + unsigned long nr_pages) { - int i; + struct page *page; + unsigned long i; - if (!page) + if (!nr_pages) return; + page = pfn_to_page(pfn); + /* Free a large naturally-aligned chunk if possible */ if (nr_pages == pageblock_nr_pages && (pfn & (pageblock_nr_pages - 1)) == 0) { @@ -1441,19 +1444,109 @@ static inline void __init pgdat_init_report_one_done(void) complete(&pgdat_init_all_done_comp); } +/* + * Helper for deferred_init_range, free the given range, reset the counters, and + * return number of pages freed. + */ +static inline unsigned long __init __def_free(unsigned long *nr_free, + unsigned long *free_base_pfn, + struct page **page) +{ + unsigned long nr = *nr_free; + + deferred_free_range(*free_base_pfn, nr); + *free_base_pfn = 0; + *nr_free = 0; + *page = NULL; + + return nr; +} + +static unsigned long __init deferred_init_range(int nid, int zid, + unsigned long start_pfn, + unsigned long end_pfn) +{ + struct mminit_pfnnid_cache nid_init_state = { }; + unsigned long nr_pgmask = pageblock_nr_pages - 1; + unsigned long free_base_pfn = 0; + unsigned long nr_pages = 0; + unsigned long nr_free = 0; + struct page *page = NULL; + unsigned long pfn; + + /* + * First we check if pfn is valid on architectures where it is possible + * to have holes within pageblock_nr_pages. On systems where it is not + * possible, this function is optimized out. + * + * Then, we check if a current large page is valid by only checking the + * validity of the head pfn. + * + * meminit_pfn_in_nid is checked on systems where pfns can interleave + * within a node: a pfn is between start and end of a node, but does not + * belong to this memory node. + * + * Finally, we minimize pfn page lookups and scheduler checks by + * performing it only once every pageblock_nr_pages. + * + * We do it in two loops: first we initialize struct page, than free to + * buddy allocator, becuse while we are freeing pages we can access + * pages that are ahead (computing buddy page in __free_one_page()). + */ + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) + continue; + if ((pfn & nr_pgmask) || pfn_valid(pfn)) { + if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { + if (page && (pfn & nr_pgmask)) + page++; + else + page = pfn_to_page(pfn); + __init_single_page(page, pfn, zid, nid); + cond_resched(); + } + } + } + + page = NULL; + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + if (!pfn_valid_within(pfn)) { + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) { + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + } else if (page && (pfn & nr_pgmask)) { + page++; + nr_free++; + } else { + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + page = pfn_to_page(pfn); + free_base_pfn = pfn; + nr_free = 1; + cond_resched(); + } + } + /* Free the last block of pages to allocator */ + nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + + return nr_pages; +} + /* Initialise remaining memory on a node */ static int __init deferred_init_memmap(void *data) { pg_data_t *pgdat = data; int nid = pgdat->node_id; - struct mminit_pfnnid_cache nid_init_state = { }; unsigned long start = jiffies; unsigned long nr_pages = 0; - unsigned long walk_start, walk_end; - int i, zid; + unsigned long spfn, epfn; + phys_addr_t spa, epa; + int zid; struct zone *zone; unsigned long first_init_pfn = pgdat->first_deferred_pfn; const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + u64 i; if (first_init_pfn == ULONG_MAX) { pgdat_init_report_one_done(); @@ -1475,83 +1568,12 @@ static int __init deferred_init_memmap(void *data) if (first_init_pfn < zone_end_pfn(zone)) break; } + first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); - for_each_mem_pfn_range(i, nid, &walk_start, &walk_end, NULL) { - unsigned long pfn, end_pfn; - struct page *page = NULL; - struct page *free_base_page = NULL; - unsigned long free_base_pfn = 0; - int nr_to_free = 0; - - end_pfn = min(walk_end, zone_end_pfn(zone)); - pfn = first_init_pfn; - if (pfn < walk_start) - pfn = walk_start; - if (pfn < zone->zone_start_pfn) - pfn = zone->zone_start_pfn; - - for (; pfn < end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) - goto free_range; - - /* - * Ensure pfn_valid is checked every - * pageblock_nr_pages for memory holes - */ - if ((pfn & (pageblock_nr_pages - 1)) == 0) { - if (!pfn_valid(pfn)) { - page = NULL; - goto free_range; - } - } - - if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { - page = NULL; - goto free_range; - } - - /* Minimise pfn page lookups and scheduler checks */ - if (page && (pfn & (pageblock_nr_pages - 1)) != 0) { - page++; - } else { - nr_pages += nr_to_free; - deferred_free_range(free_base_page, - free_base_pfn, nr_to_free); - free_base_page = NULL; - free_base_pfn = nr_to_free = 0; - - page = pfn_to_page(pfn); - cond_resched(); - } - - if (page->flags) { - VM_BUG_ON(page_zone(page) != zone); - goto free_range; - } - - __init_single_page(page, pfn, zid, nid); - if (!free_base_page) { - free_base_page = page; - free_base_pfn = pfn; - nr_to_free = 0; - } - nr_to_free++; - - /* Where possible, batch up pages for a single free */ - continue; -free_range: - /* Free the current block of pages to allocator */ - nr_pages += nr_to_free; - deferred_free_range(free_base_page, free_base_pfn, - nr_to_free); - free_base_page = NULL; - free_base_pfn = nr_to_free = 0; - } - /* Free the last block of pages to allocator */ - nr_pages += nr_to_free; - deferred_free_range(free_base_page, free_base_pfn, nr_to_free); - - first_init_pfn = max(end_pfn, first_init_pfn); + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { + spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); + epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + nr_pages += deferred_init_range(nid, zid, spfn, epfn); } /* Sanity check that the next zone really is unpopulated */ From 353b1e7b5859e98860f984d8894fa7ddc242a90e Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 15 Nov 2017 17:36:14 -0800 Subject: [PATCH 082/131] x86/mm: set fields in deferred pages Without deferred struct page feature (CONFIG_DEFERRED_STRUCT_PAGE_INIT), flags and other fields in "struct page"es are never changed prior to first initializing struct pages by going through __init_single_page(). With deferred struct page feature enabled, however, we set fields in register_page_bootmem_info that are subsequently clobbered right after in free_all_bootmem: mem_init() { register_page_bootmem_info(); free_all_bootmem(); ... } When register_page_bootmem_info() is called only non-deferred struct pages are initialized. But, this function goes through some reserved pages which might be part of the deferred, and thus are not yet initialized. mem_init register_page_bootmem_info register_page_bootmem_info_node get_page_bootmem .. setting fields here .. such as: page->freelist = (void *)type; free_all_bootmem() free_low_memory_core_early() for_each_reserved_mem_region() reserve_bootmem_region() init_reserved_page() <- Only if this is deferred reserved page __init_single_pfn() __init_single_page() memset(0) <-- Loose the set fields here We end up with issue where, currently we do not observe problem as memory is explicitly zeroed. But, if flag asserts are changed we can start hitting issues. Also, because in this patch series we will stop zeroing struct page memory during allocation, we must make sure that struct pages are properly initialized prior to using them. The deferred-reserved pages are initialized in free_all_bootmem(). Therefore, the fix is to switch the above calls. Link: http://lkml.kernel.org/r/20171013173214.27300-3-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Steven Sistare Reviewed-by: Daniel Jordan Reviewed-by: Bob Picco Tested-by: Bob Picco Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mark Rutland Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Sam Ravnborg Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5fa3a58b5d78..c3fc544b50d2 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1173,12 +1173,18 @@ void __init mem_init(void) /* clear_bss() already clear the empty_zero_page */ - register_page_bootmem_info(); - /* this will put all memory onto the freelists */ free_all_bootmem(); after_bootmem = 1; + /* + * Must be done after boot memory is put on freelist, because here we + * might set fields in deferred struct pages that have not yet been + * initialized, and free_all_bootmem() initializes all the reserved + * deferred pages for us. + */ + register_page_bootmem_info(); + /* Register memory areas for /proc/kcore */ kclist_add(&kcore_vsyscall, (void *)VSYSCALL_ADDR, PAGE_SIZE, KCORE_OTHER); From 2a20aa171071a334d80c4e5d5af719d8374702fc Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 15 Nov 2017 17:36:18 -0800 Subject: [PATCH 083/131] sparc64/mm: set fields in deferred pages Without deferred struct page feature (CONFIG_DEFERRED_STRUCT_PAGE_INIT), flags and other fields in "struct page"es are never changed prior to first initializing struct pages by going through __init_single_page(). With deferred struct page feature enabled there is a case where we set some fields prior to initializing: mem_init() { register_page_bootmem_info(); free_all_bootmem(); ... } When register_page_bootmem_info() is called only non-deferred struct pages are initialized. But, this function goes through some reserved pages which might be part of the deferred, and thus are not yet initialized. mem_init register_page_bootmem_info register_page_bootmem_info_node get_page_bootmem .. setting fields here .. such as: page->freelist = (void *)type; free_all_bootmem() free_low_memory_core_early() for_each_reserved_mem_region() reserve_bootmem_region() init_reserved_page() <- Only if this is deferred reserved page __init_single_pfn() __init_single_page() memset(0) <-- Loose the set fields here We end up with similar issue as in the previous patch, where currently we do not observe problem as memory is zeroed. But, if flag asserts are changed we can start hitting issues. Also, because in this patch series we will stop zeroing struct page memory during allocation, we must make sure that struct pages are properly initialized prior to using them. The deferred-reserved pages are initialized in free_all_bootmem(). Therefore, the fix is to switch the above calls. Link: http://lkml.kernel.org/r/20171013173214.27300-4-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Steven Sistare Reviewed-by: Daniel Jordan Reviewed-by: Bob Picco Acked-by: David S. Miller Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mark Rutland Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Sam Ravnborg Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/mm/init_64.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 2de22d703076..984e9d65ea0d 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2540,9 +2540,16 @@ void __init mem_init(void) { high_memory = __va(last_valid_pfn << PAGE_SHIFT); - register_page_bootmem_info(); free_all_bootmem(); + /* + * Must be done after boot memory is put on freelist, because here we + * might set fields in deferred struct pages that have not yet been + * initialized, and free_all_bootmem() initializes all the reserved + * deferred pages for us. + */ + register_page_bootmem_info(); + /* * Set up the zero page, mark it reserved, so that page count * is not manipulated when freeing the page from user ptes. From df8ee578894ebb591c2995cce422e6189c8bb757 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 15 Nov 2017 17:36:22 -0800 Subject: [PATCH 084/131] sparc64: simplify vmemmap_populate Remove duplicating code by using common functions vmemmap_pud_populate and vmemmap_pgd_populate. Link: http://lkml.kernel.org/r/20171013173214.27300-5-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Steven Sistare Reviewed-by: Daniel Jordan Reviewed-by: Bob Picco Acked-by: David S. Miller Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mark Rutland Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Sam Ravnborg Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/mm/init_64.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 984e9d65ea0d..051f73401793 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2644,30 +2644,19 @@ int __meminit vmemmap_populate(unsigned long vstart, unsigned long vend, vstart = vstart & PMD_MASK; vend = ALIGN(vend, PMD_SIZE); for (; vstart < vend; vstart += PMD_SIZE) { - pgd_t *pgd = pgd_offset_k(vstart); + pgd_t *pgd = vmemmap_pgd_populate(vstart, node); unsigned long pte; pud_t *pud; pmd_t *pmd; - if (pgd_none(*pgd)) { - pud_t *new = vmemmap_alloc_block(PAGE_SIZE, node); + if (!pgd) + return -ENOMEM; - if (!new) - return -ENOMEM; - pgd_populate(&init_mm, pgd, new); - } - - pud = pud_offset(pgd, vstart); - if (pud_none(*pud)) { - pmd_t *new = vmemmap_alloc_block(PAGE_SIZE, node); - - if (!new) - return -ENOMEM; - pud_populate(&init_mm, pud, new); - } + pud = vmemmap_pud_populate(pgd, vstart, node); + if (!pud) + return -ENOMEM; pmd = pmd_offset(pud, vstart); - pte = pmd_val(*pmd); if (!(pte & _PAGE_VALID)) { void *block = vmemmap_alloc_block(PMD_SIZE, node); From ea1f5f3712afe895dfa4176ec87376b4a9ac23be Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 15 Nov 2017 17:36:27 -0800 Subject: [PATCH 085/131] mm: define memblock_virt_alloc_try_nid_raw * A new variant of memblock_virt_alloc_* allocations: memblock_virt_alloc_try_nid_raw() - Does not zero the allocated memory - Does not panic if request cannot be satisfied * optimize early system hash allocations Clients can call alloc_large_system_hash() with flag: HASH_ZERO to specify that memory that was allocated for system hash needs to be zeroed, otherwise the memory does not need to be zeroed, and client will initialize it. If memory does not need to be zero'd, call the new memblock_virt_alloc_raw() interface, and thus improve the boot performance. * debug for raw alloctor When CONFIG_DEBUG_VM is enabled, this patch sets all the memory that is returned by memblock_virt_alloc_try_nid_raw() to ones to ensure that no places excpect zeroed memory. Link: http://lkml.kernel.org/r/20171013173214.27300-6-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Steven Sistare Reviewed-by: Daniel Jordan Reviewed-by: Bob Picco Tested-by: Bob Picco Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mark Rutland Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Sam Ravnborg Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/bootmem.h | 27 +++++++++++++++++++ mm/memblock.c | 60 ++++++++++++++++++++++++++++++++++++----- mm/page_alloc.c | 15 +++++------ 3 files changed, 87 insertions(+), 15 deletions(-) diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index fdf40ca04b3c..a53063e9d7d8 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -161,6 +161,9 @@ extern void *__alloc_bootmem_low_node(pg_data_t *pgdat, #define BOOTMEM_ALLOC_ANYWHERE (~(phys_addr_t)0) /* FIXME: Move to memblock.h at a point where we remove nobootmem.c */ +void *memblock_virt_alloc_try_nid_raw(phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, + phys_addr_t max_addr, int nid); void *memblock_virt_alloc_try_nid_nopanic(phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid); @@ -177,6 +180,14 @@ static inline void * __init memblock_virt_alloc( NUMA_NO_NODE); } +static inline void * __init memblock_virt_alloc_raw( + phys_addr_t size, phys_addr_t align) +{ + return memblock_virt_alloc_try_nid_raw(size, align, BOOTMEM_LOW_LIMIT, + BOOTMEM_ALLOC_ACCESSIBLE, + NUMA_NO_NODE); +} + static inline void * __init memblock_virt_alloc_nopanic( phys_addr_t size, phys_addr_t align) { @@ -258,6 +269,14 @@ static inline void * __init memblock_virt_alloc( return __alloc_bootmem(size, align, BOOTMEM_LOW_LIMIT); } +static inline void * __init memblock_virt_alloc_raw( + phys_addr_t size, phys_addr_t align) +{ + if (!align) + align = SMP_CACHE_BYTES; + return __alloc_bootmem_nopanic(size, align, BOOTMEM_LOW_LIMIT); +} + static inline void * __init memblock_virt_alloc_nopanic( phys_addr_t size, phys_addr_t align) { @@ -310,6 +329,14 @@ static inline void * __init memblock_virt_alloc_try_nid(phys_addr_t size, min_addr); } +static inline void * __init memblock_virt_alloc_try_nid_raw( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, int nid) +{ + return ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, align, + min_addr, max_addr); +} + static inline void * __init memblock_virt_alloc_try_nid_nopanic( phys_addr_t size, phys_addr_t align, phys_addr_t min_addr, phys_addr_t max_addr, int nid) diff --git a/mm/memblock.c b/mm/memblock.c index 18dbb69086bc..46aacdfa4f4d 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1327,7 +1327,6 @@ again: return NULL; done: ptr = phys_to_virt(alloc); - memset(ptr, 0, size); /* * The min_count is set to 0 so that bootmem allocated blocks @@ -1340,6 +1339,45 @@ done: return ptr; } +/** + * memblock_virt_alloc_try_nid_raw - allocate boot memory block without zeroing + * memory and without panicking + * @size: size of memory block to be allocated in bytes + * @align: alignment of the region and block's size + * @min_addr: the lower bound of the memory region from where the allocation + * is preferred (phys address) + * @max_addr: the upper bound of the memory region from where the allocation + * is preferred (phys address), or %BOOTMEM_ALLOC_ACCESSIBLE to + * allocate only from memory limited by memblock.current_limit value + * @nid: nid of the free area to find, %NUMA_NO_NODE for any node + * + * Public function, provides additional debug information (including caller + * info), if enabled. Does not zero allocated memory, does not panic if request + * cannot be satisfied. + * + * RETURNS: + * Virtual address of allocated memory block on success, NULL on failure. + */ +void * __init memblock_virt_alloc_try_nid_raw( + phys_addr_t size, phys_addr_t align, + phys_addr_t min_addr, phys_addr_t max_addr, + int nid) +{ + void *ptr; + + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", + __func__, (u64)size, (u64)align, nid, (u64)min_addr, + (u64)max_addr, (void *)_RET_IP_); + + ptr = memblock_virt_alloc_internal(size, align, + min_addr, max_addr, nid); +#ifdef CONFIG_DEBUG_VM + if (ptr && size > 0) + memset(ptr, 0xff, size); +#endif + return ptr; +} + /** * memblock_virt_alloc_try_nid_nopanic - allocate boot memory block * @size: size of memory block to be allocated in bytes @@ -1351,8 +1389,8 @@ done: * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * - * Public version of _memblock_virt_alloc_try_nid_nopanic() which provides - * additional debug information (including caller info), if enabled. + * Public function, provides additional debug information (including caller + * info), if enabled. This function zeroes the allocated memory. * * RETURNS: * Virtual address of allocated memory block on success, NULL on failure. @@ -1362,11 +1400,17 @@ void * __init memblock_virt_alloc_try_nid_nopanic( phys_addr_t min_addr, phys_addr_t max_addr, int nid) { + void *ptr; + memblock_dbg("%s: %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx %pF\n", __func__, (u64)size, (u64)align, nid, (u64)min_addr, (u64)max_addr, (void *)_RET_IP_); - return memblock_virt_alloc_internal(size, align, min_addr, - max_addr, nid); + + ptr = memblock_virt_alloc_internal(size, align, + min_addr, max_addr, nid); + if (ptr) + memset(ptr, 0, size); + return ptr; } /** @@ -1380,7 +1424,7 @@ void * __init memblock_virt_alloc_try_nid_nopanic( * allocate only from memory limited by memblock.current_limit value * @nid: nid of the free area to find, %NUMA_NO_NODE for any node * - * Public panicking version of _memblock_virt_alloc_try_nid_nopanic() + * Public panicking version of memblock_virt_alloc_try_nid_nopanic() * which provides debug information (including caller info), if enabled, * and panics if the request can not be satisfied. * @@ -1399,8 +1443,10 @@ void * __init memblock_virt_alloc_try_nid( (u64)max_addr, (void *)_RET_IP_); ptr = memblock_virt_alloc_internal(size, align, min_addr, max_addr, nid); - if (ptr) + if (ptr) { + memset(ptr, 0, size); return ptr; + } panic("%s: Failed to allocate %llu bytes align=0x%llx nid=%d from=0x%llx max_addr=0x%llx\n", __func__, (u64)size, (u64)align, nid, (u64)min_addr, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4dee5082d3d7..805f30dd1c26 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7313,18 +7313,17 @@ void *__init alloc_large_system_hash(const char *tablename, log2qty = ilog2(numentries); - /* - * memblock allocator returns zeroed memory already, so HASH_ZERO is - * currently not used when HASH_EARLY is specified. - */ gfp_flags = (flags & HASH_ZERO) ? GFP_ATOMIC | __GFP_ZERO : GFP_ATOMIC; do { size = bucketsize << log2qty; - if (flags & HASH_EARLY) - table = memblock_virt_alloc_nopanic(size, 0); - else if (hashdist) + if (flags & HASH_EARLY) { + if (flags & HASH_ZERO) + table = memblock_virt_alloc_nopanic(size, 0); + else + table = memblock_virt_alloc_raw(size, 0); + } else if (hashdist) { table = __vmalloc(size, gfp_flags, PAGE_KERNEL); - else { + } else { /* * If bucketsize is not a power-of-two, we may free * some pages at the end of hash table which From a4a3ede2132ae0863e2d43e06f9b5697c51a7a3b Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 15 Nov 2017 17:36:31 -0800 Subject: [PATCH 086/131] mm: zero reserved and unavailable struct pages Some memory is reserved but unavailable: not present in memblock.memory (because not backed by physical pages), but present in memblock.reserved. Such memory has backing struct pages, but they are not initialized by going through __init_single_page(). In some cases these struct pages are accessed even if they do not contain any data. One example is page_to_pfn() might access page->flags if this is where section information is stored (CONFIG_SPARSEMEM, SECTION_IN_PAGE_FLAGS). One example of such memory: trim_low_memory_range() unconditionally reserves from pfn 0, but e820__memblock_setup() might provide the exiting memory from pfn 1 (i.e. KVM). Since struct pages are zeroed in __init_single_page(), and not during allocation time, we must zero such struct pages explicitly. The patch involves adding a new memblock iterator: for_each_resv_unavail_range(i, p_start, p_end) Which iterates through reserved && !memory lists, and we zero struct pages explicitly by calling mm_zero_struct_page(). === Here is more detailed example of problem that this patch is addressing: Run tested on qemu with the following arguments: -enable-kvm -cpu kvm64 -m 512 -smp 2 This patch reports that there are 98 unavailable pages. They are: pfn 0 and pfns in range [159, 255]. Note, trim_low_memory_range() reserves only pfns in range [0, 15], it does not reserve [159, 255] ones. e820__memblock_setup() reports linux that the following physical ranges are available: [1 , 158] [256, 130783] Notice, that exactly unavailable pfns are missing! Now, lets check what we have in zone 0: [1, 131039] pfn 0, is not part of the zone, but pfns [1, 158], are. However, the bigger problem we have if we do not initialize these struct pages is with memory hotplug. Because, that path operates at 2M boundaries (section_nr). And checks if 2M range of pages is hot removable. It starts with first pfn from zone, rounds it down to 2M boundary (sturct pages are allocated at 2M boundaries when vmemmap is created), and checks if that section is hot removable. In this case start with pfn 1 and convert it down to pfn 0. Later pfn is converted to struct page, and some fields are checked. Now, if we do not zero struct pages, we get unpredictable results. In fact when CONFIG_VM_DEBUG is enabled, and we explicitly set all vmemmap memory to ones, the following panic is observed with kernel test without this patch applied: BUG: unable to handle kernel NULL pointer dereference at (null) IP: is_pageblock_removable_nolock+0x35/0x90 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT ... task: ffff88001f4e2900 task.stack: ffffc90000314000 RIP: 0010:is_pageblock_removable_nolock+0x35/0x90 Call Trace: ? is_mem_section_removable+0x5a/0xd0 show_mem_removable+0x6b/0xa0 dev_attr_show+0x1b/0x50 sysfs_kf_seq_show+0xa1/0x100 kernfs_seq_show+0x22/0x30 seq_read+0x1ac/0x3a0 kernfs_fop_read+0x36/0x190 ? security_file_permission+0x90/0xb0 __vfs_read+0x16/0x30 vfs_read+0x81/0x130 SyS_read+0x44/0xa0 entry_SYSCALL_64_fastpath+0x1f/0xbd Link: http://lkml.kernel.org/r/20171013173214.27300-7-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Steven Sistare Reviewed-by: Daniel Jordan Reviewed-by: Bob Picco Tested-by: Bob Picco Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mark Rutland Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Sam Ravnborg Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 16 ++++++++++++++++ include/linux/mm.h | 15 +++++++++++++++ mm/page_alloc.c | 40 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 71 insertions(+) diff --git a/include/linux/memblock.h b/include/linux/memblock.h index ce0e5634c2f9..7ed0f7782d16 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -237,6 +237,22 @@ unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn); for_each_mem_range_rev(i, &memblock.memory, &memblock.reserved, \ nid, flags, p_start, p_end, p_nid) +/** + * for_each_resv_unavail_range - iterate through reserved and unavailable memory + * @i: u64 used as loop variable + * @flags: pick from blocks based on memory attributes + * @p_start: ptr to phys_addr_t for start address of the range, can be %NULL + * @p_end: ptr to phys_addr_t for end address of the range, can be %NULL + * + * Walks over unavailable but reserved (reserved && !memory) areas of memblock. + * Available as soon as memblock is initialized. + * Note: because this memory does not belong to any physical node, flags and + * nid arguments do not make sense and thus not exported as arguments. + */ +#define for_each_resv_unavail_range(i, p_start, p_end) \ + for_each_mem_range(i, &memblock.reserved, &memblock.memory, \ + NUMA_NO_NODE, MEMBLOCK_NONE, p_start, p_end, NULL) + static inline void memblock_set_region_flags(struct memblock_region *r, unsigned long flags) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c1e82a1aa77..703599fa8288 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -95,6 +95,15 @@ extern int mmap_rnd_compat_bits __read_mostly; #define mm_forbids_zeropage(X) (0) #endif +/* + * On some architectures it is expensive to call memset() for small sizes. + * Those architectures should provide their own implementation of "struct page" + * zeroing by defining this macro in . + */ +#ifndef mm_zero_struct_page +#define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) +#endif + /* * Default maximum number of active map areas, this limits the number of vmas * per mm struct. Users can overwrite this number by sysctl but there is a @@ -2030,6 +2039,12 @@ extern int __meminit __early_pfn_to_nid(unsigned long pfn, struct mminit_pfnnid_cache *state); #endif +#ifdef CONFIG_HAVE_MEMBLOCK +void zero_resv_unavail(void); +#else +static inline void zero_resv_unavail(void) {} +#endif + extern void set_dma_reserve(unsigned long new_dma_reserve); extern void memmap_init_zone(unsigned long, int, unsigned long, unsigned long, enum memmap_context); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 805f30dd1c26..c37343ef2889 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6215,6 +6215,44 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, free_area_init_core(pgdat); } +#ifdef CONFIG_HAVE_MEMBLOCK +/* + * Only struct pages that are backed by physical memory are zeroed and + * initialized by going through __init_single_page(). But, there are some + * struct pages which are reserved in memblock allocator and their fields + * may be accessed (for example page_to_pfn() on some configuration accesses + * flags). We must explicitly zero those struct pages. + */ +void __paginginit zero_resv_unavail(void) +{ + phys_addr_t start, end; + unsigned long pfn; + u64 i, pgcnt; + + /* + * Loop through ranges that are reserved, but do not have reported + * physical memory backing. + */ + pgcnt = 0; + for_each_resv_unavail_range(i, &start, &end) { + for (pfn = PFN_DOWN(start); pfn < PFN_UP(end); pfn++) { + mm_zero_struct_page(pfn_to_page(pfn)); + pgcnt++; + } + } + + /* + * Struct pages that do not have backing memory. This could be because + * firmware is using some of this memory, or for some other reasons. + * Once memblock is changed so such behaviour is not allowed: i.e. + * list of "reserved" memory must be a subset of list of "memory", then + * this code can be removed. + */ + if (pgcnt) + pr_info("Reserved but unavailable: %lld pages", pgcnt); +} +#endif /* CONFIG_HAVE_MEMBLOCK */ + #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP #if MAX_NUMNODES > 1 @@ -6638,6 +6676,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) node_set_state(nid, N_MEMORY); check_for_memory(pgdat, nid); } + zero_resv_unavail(); } static int __init cmdline_parse_core(char *p, unsigned long *core) @@ -6801,6 +6840,7 @@ void __init free_area_init(unsigned long *zones_size) { free_area_init_node(0, zones_size, __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); + zero_resv_unavail(); } static int page_alloc_cpu_dead(unsigned int cpu) From d17a1d97dc208d664c91cc387ffb752c7f85dc61 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 15 Nov 2017 17:36:35 -0800 Subject: [PATCH 087/131] x86/mm/kasan: don't use vmemmap_populate() to initialize shadow The kasan shadow is currently mapped using vmemmap_populate() since that provides a semi-convenient way to map pages into init_top_pgt. However, since that no longer zeroes the mapped pages, it is not suitable for kasan, which requires zeroed shadow memory. Add kasan_populate_shadow() interface and use it instead of vmemmap_populate(). Besides, this allows us to take advantage of gigantic pages and use them to populate the shadow, which should save us some memory wasted on page tables and reduce TLB pressure. Link: http://lkml.kernel.org/r/20171103185147.2688-2-pasha.tatashin@oracle.com Signed-off-by: Andrey Ryabinin Signed-off-by: Pavel Tatashin Cc: Steven Sistare Cc: Daniel Jordan Cc: Bob Picco Cc: Michal Hocko Cc: Alexander Potapenko Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mark Rutland Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Sam Ravnborg Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 2 +- arch/x86/mm/kasan_init_64.c | 143 ++++++++++++++++++++++++++++++++++-- 2 files changed, 137 insertions(+), 8 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cb678192da4a..df3276d6bfe3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -110,7 +110,7 @@ config X86 select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_JUMP_LABEL - select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP + select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 2b60dc6e64b1..99dfed6dfef8 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -4,12 +4,14 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include @@ -18,7 +20,134 @@ extern struct range pfn_mapped[E820_MAX_ENTRIES]; static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE); -static int __init map_range(struct range *range) +static __init void *early_alloc(size_t size, int nid) +{ + return memblock_virt_alloc_try_nid_nopanic(size, size, + __pa(MAX_DMA_ADDRESS), BOOTMEM_ALLOC_ACCESSIBLE, nid); +} + +static void __init kasan_populate_pmd(pmd_t *pmd, unsigned long addr, + unsigned long end, int nid) +{ + pte_t *pte; + + if (pmd_none(*pmd)) { + void *p; + + if (boot_cpu_has(X86_FEATURE_PSE) && + ((end - addr) == PMD_SIZE) && + IS_ALIGNED(addr, PMD_SIZE)) { + p = early_alloc(PMD_SIZE, nid); + if (p && pmd_set_huge(pmd, __pa(p), PAGE_KERNEL)) + return; + else if (p) + memblock_free(__pa(p), PMD_SIZE); + } + + p = early_alloc(PAGE_SIZE, nid); + pmd_populate_kernel(&init_mm, pmd, p); + } + + pte = pte_offset_kernel(pmd, addr); + do { + pte_t entry; + void *p; + + if (!pte_none(*pte)) + continue; + + p = early_alloc(PAGE_SIZE, nid); + entry = pfn_pte(PFN_DOWN(__pa(p)), PAGE_KERNEL); + set_pte_at(&init_mm, addr, pte, entry); + } while (pte++, addr += PAGE_SIZE, addr != end); +} + +static void __init kasan_populate_pud(pud_t *pud, unsigned long addr, + unsigned long end, int nid) +{ + pmd_t *pmd; + unsigned long next; + + if (pud_none(*pud)) { + void *p; + + if (boot_cpu_has(X86_FEATURE_GBPAGES) && + ((end - addr) == PUD_SIZE) && + IS_ALIGNED(addr, PUD_SIZE)) { + p = early_alloc(PUD_SIZE, nid); + if (p && pud_set_huge(pud, __pa(p), PAGE_KERNEL)) + return; + else if (p) + memblock_free(__pa(p), PUD_SIZE); + } + + p = early_alloc(PAGE_SIZE, nid); + pud_populate(&init_mm, pud, p); + } + + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (!pmd_large(*pmd)) + kasan_populate_pmd(pmd, addr, next, nid); + } while (pmd++, addr = next, addr != end); +} + +static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr, + unsigned long end, int nid) +{ + pud_t *pud; + unsigned long next; + + if (p4d_none(*p4d)) { + void *p = early_alloc(PAGE_SIZE, nid); + + p4d_populate(&init_mm, p4d, p); + } + + pud = pud_offset(p4d, addr); + do { + next = pud_addr_end(addr, end); + if (!pud_large(*pud)) + kasan_populate_pud(pud, addr, next, nid); + } while (pud++, addr = next, addr != end); +} + +static void __init kasan_populate_pgd(pgd_t *pgd, unsigned long addr, + unsigned long end, int nid) +{ + void *p; + p4d_t *p4d; + unsigned long next; + + if (pgd_none(*pgd)) { + p = early_alloc(PAGE_SIZE, nid); + pgd_populate(&init_mm, pgd, p); + } + + p4d = p4d_offset(pgd, addr); + do { + next = p4d_addr_end(addr, end); + kasan_populate_p4d(p4d, addr, next, nid); + } while (p4d++, addr = next, addr != end); +} + +static void __init kasan_populate_shadow(unsigned long addr, unsigned long end, + int nid) +{ + pgd_t *pgd; + unsigned long next; + + addr = addr & PAGE_MASK; + end = round_up(end, PAGE_SIZE); + pgd = pgd_offset_k(addr); + do { + next = pgd_addr_end(addr, end); + kasan_populate_pgd(pgd, addr, next, nid); + } while (pgd++, addr = next, addr != end); +} + +static void __init map_range(struct range *range) { unsigned long start; unsigned long end; @@ -26,7 +155,7 @@ static int __init map_range(struct range *range) start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); - return vmemmap_populate(start, end, NUMA_NO_NODE); + kasan_populate_shadow(start, end, early_pfn_to_nid(range->start)); } static void __init clear_pgds(unsigned long start, @@ -189,16 +318,16 @@ void __init kasan_init(void) if (pfn_mapped[i].end == 0) break; - if (map_range(&pfn_mapped[i])) - panic("kasan: unable to allocate shadow!"); + map_range(&pfn_mapped[i]); } + kasan_populate_zero_shadow( kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), kasan_mem_to_shadow((void *)__START_KERNEL_map)); - vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext), - (unsigned long)kasan_mem_to_shadow(_end), - NUMA_NO_NODE); + kasan_populate_shadow((unsigned long)kasan_mem_to_shadow(_stext), + (unsigned long)kasan_mem_to_shadow(_end), + early_pfn_to_nid(__pa(_stext))); kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); From e17d8025f07e4fd9d73b137a8bcab04548126b83 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 15 Nov 2017 17:36:40 -0800 Subject: [PATCH 088/131] arm64/mm/kasan: don't use vmemmap_populate() to initialize shadow The kasan shadow is currently mapped using vmemmap_populate() since that provides a semi-convenient way to map pages into init_top_pgt. However, since that no longer zeroes the mapped pages, it is not suitable for kasan, which requires zeroed shadow memory. Add kasan_populate_shadow() interface and use it instead of vmemmap_populate(). Besides, this allows us to take advantage of gigantic pages and use them to populate the shadow, which should save us some memory wasted on page tables and reduce TLB pressure. Link: http://lkml.kernel.org/r/20171103185147.2688-3-pasha.tatashin@oracle.com Signed-off-by: Will Deacon Signed-off-by: Pavel Tatashin Cc: Andrey Ryabinin Cc: Steven Sistare Cc: Daniel Jordan Cc: Bob Picco Cc: Michal Hocko Cc: Alexander Potapenko Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mark Rutland Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Sam Ravnborg Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/Kconfig | 2 +- arch/arm64/mm/kasan_init.c | 130 +++++++++++++++++++++++-------------- 2 files changed, 81 insertions(+), 51 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index ba6aab55d464..a93339f5178f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -85,7 +85,7 @@ config ARM64 select HAVE_ARCH_BITREVERSE select HAVE_ARCH_HUGE_VMAP select HAVE_ARCH_JUMP_LABEL - select HAVE_ARCH_KASAN if SPARSEMEM_VMEMMAP && !(ARM64_16K_PAGES && ARM64_VA_BITS_48) + select HAVE_ARCH_KASAN if !(ARM64_16K_PAGES && ARM64_VA_BITS_48) select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS select HAVE_ARCH_MMAP_RND_COMPAT_BITS if COMPAT diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 81f03959a4ab..acba49fb5aac 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -11,6 +11,7 @@ */ #define pr_fmt(fmt) "kasan: " fmt +#include #include #include #include @@ -35,77 +36,117 @@ static pgd_t tmp_pg_dir[PTRS_PER_PGD] __initdata __aligned(PGD_SIZE); * with the physical address from __pa_symbol. */ -static void __init kasan_early_pte_populate(pmd_t *pmd, unsigned long addr, - unsigned long end) +static phys_addr_t __init kasan_alloc_zeroed_page(int node) +{ + void *p = memblock_virt_alloc_try_nid(PAGE_SIZE, PAGE_SIZE, + __pa(MAX_DMA_ADDRESS), + MEMBLOCK_ALLOC_ACCESSIBLE, node); + return __pa(p); +} + +static pte_t *__init kasan_pte_offset(pmd_t *pmd, unsigned long addr, int node, + bool early) +{ + if (pmd_none(*pmd)) { + phys_addr_t pte_phys = early ? __pa_symbol(kasan_zero_pte) + : kasan_alloc_zeroed_page(node); + __pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE); + } + + return early ? pte_offset_kimg(pmd, addr) + : pte_offset_kernel(pmd, addr); +} + +static pmd_t *__init kasan_pmd_offset(pud_t *pud, unsigned long addr, int node, + bool early) +{ + if (pud_none(*pud)) { + phys_addr_t pmd_phys = early ? __pa_symbol(kasan_zero_pmd) + : kasan_alloc_zeroed_page(node); + __pud_populate(pud, pmd_phys, PMD_TYPE_TABLE); + } + + return early ? pmd_offset_kimg(pud, addr) : pmd_offset(pud, addr); +} + +static pud_t *__init kasan_pud_offset(pgd_t *pgd, unsigned long addr, int node, + bool early) +{ + if (pgd_none(*pgd)) { + phys_addr_t pud_phys = early ? __pa_symbol(kasan_zero_pud) + : kasan_alloc_zeroed_page(node); + __pgd_populate(pgd, pud_phys, PMD_TYPE_TABLE); + } + + return early ? pud_offset_kimg(pgd, addr) : pud_offset(pgd, addr); +} + +static void __init kasan_pte_populate(pmd_t *pmd, unsigned long addr, + unsigned long end, int node, bool early) { - pte_t *pte; unsigned long next; + pte_t *pte = kasan_pte_offset(pmd, addr, node, early); - if (pmd_none(*pmd)) - __pmd_populate(pmd, __pa_symbol(kasan_zero_pte), PMD_TYPE_TABLE); - - pte = pte_offset_kimg(pmd, addr); do { + phys_addr_t page_phys = early ? __pa_symbol(kasan_zero_page) + : kasan_alloc_zeroed_page(node); next = addr + PAGE_SIZE; - set_pte(pte, pfn_pte(sym_to_pfn(kasan_zero_page), - PAGE_KERNEL)); + set_pte(pte, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); } while (pte++, addr = next, addr != end && pte_none(*pte)); } -static void __init kasan_early_pmd_populate(pud_t *pud, - unsigned long addr, - unsigned long end) +static void __init kasan_pmd_populate(pud_t *pud, unsigned long addr, + unsigned long end, int node, bool early) { - pmd_t *pmd; unsigned long next; + pmd_t *pmd = kasan_pmd_offset(pud, addr, node, early); - if (pud_none(*pud)) - __pud_populate(pud, __pa_symbol(kasan_zero_pmd), PMD_TYPE_TABLE); - - pmd = pmd_offset_kimg(pud, addr); do { next = pmd_addr_end(addr, end); - kasan_early_pte_populate(pmd, addr, next); + kasan_pte_populate(pmd, addr, next, node, early); } while (pmd++, addr = next, addr != end && pmd_none(*pmd)); } -static void __init kasan_early_pud_populate(pgd_t *pgd, - unsigned long addr, - unsigned long end) +static void __init kasan_pud_populate(pgd_t *pgd, unsigned long addr, + unsigned long end, int node, bool early) { - pud_t *pud; unsigned long next; + pud_t *pud = kasan_pud_offset(pgd, addr, node, early); - if (pgd_none(*pgd)) - __pgd_populate(pgd, __pa_symbol(kasan_zero_pud), PUD_TYPE_TABLE); - - pud = pud_offset_kimg(pgd, addr); do { next = pud_addr_end(addr, end); - kasan_early_pmd_populate(pud, addr, next); + kasan_pmd_populate(pud, addr, next, node, early); } while (pud++, addr = next, addr != end && pud_none(*pud)); } -static void __init kasan_map_early_shadow(void) +static void __init kasan_pgd_populate(unsigned long addr, unsigned long end, + int node, bool early) { - unsigned long addr = KASAN_SHADOW_START; - unsigned long end = KASAN_SHADOW_END; unsigned long next; pgd_t *pgd; pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - kasan_early_pud_populate(pgd, addr, next); + kasan_pud_populate(pgd, addr, next, node, early); } while (pgd++, addr = next, addr != end); } +/* The early shadow maps everything to a single page of zeroes */ asmlinkage void __init kasan_early_init(void) { BUILD_BUG_ON(KASAN_SHADOW_OFFSET != KASAN_SHADOW_END - (1UL << 61)); BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_START, PGDIR_SIZE)); BUILD_BUG_ON(!IS_ALIGNED(KASAN_SHADOW_END, PGDIR_SIZE)); - kasan_map_early_shadow(); + kasan_pgd_populate(KASAN_SHADOW_START, KASAN_SHADOW_END, NUMA_NO_NODE, + true); +} + +/* Set up full kasan mappings, ensuring that the mapped pages are zeroed */ +static void __init kasan_map_populate(unsigned long start, unsigned long end, + int node) +{ + kasan_pgd_populate(start & PAGE_MASK, PAGE_ALIGN(end), node, false); } /* @@ -142,8 +183,8 @@ void __init kasan_init(void) struct memblock_region *reg; int i; - kimg_shadow_start = (u64)kasan_mem_to_shadow(_text); - kimg_shadow_end = (u64)kasan_mem_to_shadow(_end); + kimg_shadow_start = (u64)kasan_mem_to_shadow(_text) & PAGE_MASK; + kimg_shadow_end = PAGE_ALIGN((u64)kasan_mem_to_shadow(_end)); mod_shadow_start = (u64)kasan_mem_to_shadow((void *)MODULES_VADDR); mod_shadow_end = (u64)kasan_mem_to_shadow((void *)MODULES_END); @@ -161,19 +202,8 @@ void __init kasan_init(void) clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); - vmemmap_populate(kimg_shadow_start, kimg_shadow_end, - pfn_to_nid(virt_to_pfn(lm_alias(_text)))); - - /* - * vmemmap_populate() has populated the shadow region that covers the - * kernel image with SWAPPER_BLOCK_SIZE mappings, so we have to round - * the start and end addresses to SWAPPER_BLOCK_SIZE as well, to prevent - * kasan_populate_zero_shadow() from replacing the page table entries - * (PMD or PTE) at the edges of the shadow region for the kernel - * image. - */ - kimg_shadow_start = round_down(kimg_shadow_start, SWAPPER_BLOCK_SIZE); - kimg_shadow_end = round_up(kimg_shadow_end, SWAPPER_BLOCK_SIZE); + kasan_map_populate(kimg_shadow_start, kimg_shadow_end, + pfn_to_nid(virt_to_pfn(lm_alias(_text)))); kasan_populate_zero_shadow((void *)KASAN_SHADOW_START, (void *)mod_shadow_start); @@ -191,9 +221,9 @@ void __init kasan_init(void) if (start >= end) break; - vmemmap_populate((unsigned long)kasan_mem_to_shadow(start), - (unsigned long)kasan_mem_to_shadow(end), - pfn_to_nid(virt_to_pfn(start))); + kasan_map_populate((unsigned long)kasan_mem_to_shadow(start), + (unsigned long)kasan_mem_to_shadow(end), + pfn_to_nid(virt_to_pfn(start))); } /* From f7f99100d8d95dbcf09e0216a143211e79418b9f Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 15 Nov 2017 17:36:44 -0800 Subject: [PATCH 089/131] mm: stop zeroing memory during allocation in vmemmap vmemmap_alloc_block() will no longer zero the block, so zero memory at its call sites for everything except struct pages. Struct page memory is zero'd by struct page initialization. Replace allocators in sparse-vmemmap to use the non-zeroing version. So, we will get the performance improvement by zeroing the memory in parallel when struct pages are zeroed. Add struct page zeroing as a part of initialization of other fields in __init_single_page(). This single thread performance collected on: Intel(R) Xeon(R) CPU E7-8895 v3 @ 2.60GHz with 1T of memory (268400646 pages in 8 nodes): BASE FIX sparse_init 11.244671836s 0.007199623s zone_sizes_init 4.879775891s 8.355182299s -------------------------- Total 16.124447727s 8.362381922s sparse_init is where memory for struct pages is zeroed, and the zeroing part is moved later in this patch into __init_single_page(), which is called from zone_sizes_init(). [akpm@linux-foundation.org: make vmemmap_alloc_block_zero() private to sparse-vmemmap.c] Link: http://lkml.kernel.org/r/20171013173214.27300-10-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Steven Sistare Reviewed-by: Daniel Jordan Reviewed-by: Bob Picco Tested-by: Bob Picco Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Christian Borntraeger Cc: David S. Miller Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mark Rutland Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Sam Ravnborg Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + mm/sparse-vmemmap.c | 26 ++++++++++++++++++-------- mm/sparse.c | 6 +++--- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c37343ef2889..39e847cd1484 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1168,6 +1168,7 @@ static void free_one_page(struct zone *zone, static void __meminit __init_single_page(struct page *page, unsigned long pfn, unsigned long zone, int nid) { + mm_zero_struct_page(page); set_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 478ce6d4a2c4..4e49762599c8 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -42,7 +42,7 @@ static void * __ref __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return memblock_virt_alloc_try_nid(size, align, goal, + return memblock_virt_alloc_try_nid_raw(size, align, goal, BOOTMEM_ALLOC_ACCESSIBLE, node); } @@ -55,9 +55,8 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) if (slab_is_available()) { struct page *page; - page = alloc_pages_node(node, - GFP_KERNEL | __GFP_ZERO | __GFP_RETRY_MAYFAIL, - get_order(size)); + page = alloc_pages_node(node, GFP_KERNEL | __GFP_RETRY_MAYFAIL, + get_order(size)); if (page) return page_address(page); return NULL; @@ -180,11 +179,22 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) return pte; } +static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node) +{ + void *p = vmemmap_alloc_block(size, node); + + if (!p) + return NULL; + memset(p, 0, size); + + return p; +} + pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) { pmd_t *pmd = pmd_offset(pud, addr); if (pmd_none(*pmd)) { - void *p = vmemmap_alloc_block(PAGE_SIZE, node); + void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; pmd_populate_kernel(&init_mm, pmd, p); @@ -196,7 +206,7 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) { pud_t *pud = pud_offset(p4d, addr); if (pud_none(*pud)) { - void *p = vmemmap_alloc_block(PAGE_SIZE, node); + void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; pud_populate(&init_mm, pud, p); @@ -208,7 +218,7 @@ p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) { p4d_t *p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { - void *p = vmemmap_alloc_block(PAGE_SIZE, node); + void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; p4d_populate(&init_mm, p4d, p); @@ -220,7 +230,7 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node) { pgd_t *pgd = pgd_offset_k(addr); if (pgd_none(*pgd)) { - void *p = vmemmap_alloc_block(PAGE_SIZE, node); + void *p = vmemmap_alloc_block_zero(PAGE_SIZE, node); if (!p) return NULL; pgd_populate(&init_mm, pgd, p); diff --git a/mm/sparse.c b/mm/sparse.c index 60805abf98af..7a5dacaa06e3 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -453,9 +453,9 @@ void __init sparse_mem_maps_populate_node(struct page **map_map, } size = PAGE_ALIGN(size); - map = memblock_virt_alloc_try_nid(size * map_count, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS), - BOOTMEM_ALLOC_ACCESSIBLE, nodeid); + map = memblock_virt_alloc_try_nid_raw(size * map_count, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS), + BOOTMEM_ALLOC_ACCESSIBLE, nodeid); if (map) { for (pnum = pnum_begin; pnum < pnum_end; pnum++) { if (!present_section_nr(pnum)) From 78c943662f4b1d53ddbfc515e427827915781377 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 15 Nov 2017 17:36:48 -0800 Subject: [PATCH 090/131] sparc64: optimize struct page zeroing Add an optimized mm_zero_struct_page(), so struct page's are zeroed without calling memset(). We do eight to ten regular stores based on the size of struct page. Compiler optimizes out the conditions of switch() statement. SPARC-M6 with 15T of memory, single thread performance: BASE FIX OPTIMIZED_FIX bootmem_init 28.440467985s 2.305674818s 2.305161615s free_area_init_nodes 202.845901673s 225.343084508s 172.556506560s -------------------------------------------- Total 231.286369658s 227.648759326s 174.861668175s BASE: current linux FIX: This patch series without "optimized struct page zeroing" OPTIMIZED_FIX: This patch series including the current patch. bootmem_init() is where memory for struct pages is zeroed during allocation. Note, about two seconds in this function is a fixed time: it does not increase as memory is increased. Link: http://lkml.kernel.org/r/20171013173214.27300-11-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Reviewed-by: Steven Sistare Reviewed-by: Daniel Jordan Reviewed-by: Bob Picco Acked-by: David S. Miller Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Ard Biesheuvel Cc: Catalin Marinas Cc: Christian Borntraeger Cc: Dmitry Vyukov Cc: Heiko Carstens Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mark Rutland Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Michal Hocko Cc: Sam Ravnborg Cc: Thomas Gleixner Cc: Will Deacon Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_64.h | 30 +++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index fd9d9bac7cfa..5a9e96be1665 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -231,6 +231,36 @@ extern unsigned long _PAGE_ALL_SZ_BITS; extern struct page *mem_map_zero; #define ZERO_PAGE(vaddr) (mem_map_zero) +/* This macro must be updated when the size of struct page grows above 80 + * or reduces below 64. + * The idea that compiler optimizes out switch() statement, and only + * leaves clrx instructions + */ +#define mm_zero_struct_page(pp) do { \ + unsigned long *_pp = (void *)(pp); \ + \ + /* Check that struct page is either 64, 72, or 80 bytes */ \ + BUILD_BUG_ON(sizeof(struct page) & 7); \ + BUILD_BUG_ON(sizeof(struct page) < 64); \ + BUILD_BUG_ON(sizeof(struct page) > 80); \ + \ + switch (sizeof(struct page)) { \ + case 80: \ + _pp[9] = 0; /* fallthrough */ \ + case 72: \ + _pp[8] = 0; /* fallthrough */ \ + default: \ + _pp[7] = 0; \ + _pp[6] = 0; \ + _pp[5] = 0; \ + _pp[4] = 0; \ + _pp[3] = 0; \ + _pp[2] = 0; \ + _pp[1] = 0; \ + _pp[0] = 0; \ + } \ +} while (0) + /* PFNs are real physical page numbers. However, mem_map only begins to record * per-page information starting at pfn_base. This is to handle systems where * the first physical page in the machine is at some huge physical address, From 85ccc8fa81af74c3c9133cf243fb75f65d02a59a Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Wed, 15 Nov 2017 17:36:53 -0800 Subject: [PATCH 091/131] mm/page_alloc: make sure __rmqueue() etc are always inline __rmqueue(), __rmqueue_fallback(), __rmqueue_smallest() and __rmqueue_cma_fallback() are all in page allocator's hot path and better be finished as soon as possible. One way to make them faster is by making them inline. But as Andrew Morton and Andi Kleen pointed out: https://lkml.org/lkml/2017/10/10/1252 https://lkml.org/lkml/2017/10/10/1279 To make sure they are inlined, we should use __always_inline for them. With the will-it-scale/page_fault1/process benchmark, when using nr_cpu processes to stress buddy, the results for will-it-scale.processes with and without the patch are: On a 2-sockets Intel-Skylake machine: compiler base head gcc-4.4.7 6496131 6911823 +6.4% gcc-4.9.4 7225110 7731072 +7.0% gcc-5.4.1 7054224 7688146 +9.0% gcc-6.2.0 7059794 7651675 +8.4% On a 4-sockets Intel-Skylake machine: compiler base head gcc-4.4.7 13162890 13508193 +2.6% gcc-4.9.4 14997463 15484353 +3.2% gcc-5.4.1 14708711 15449805 +5.0% gcc-6.2.0 14574099 15349204 +5.3% The above 4 compilers are used because I've done the tests through Intel's Linux Kernel Performance(LKP) infrastructure and they are the available compilers there. The benefit being less on 4 sockets machine is due to the lock contention there(perf-profile/native_queued_spin_lock_slowpath=81%) is less severe than on the 2 sockets machine(85%). What the benchmark does is: it forks nr_cpu processes and then each process does the following: 1 mmap() 128M anonymous space; 2 writes to each page there to trigger actual page allocation; 3 munmap() it. in a loop. https://github.com/antonblanchard/will-it-scale/blob/master/tests/page_fault1.c Binary size wise, I have locally built them with different compilers: [aaron@aaronlu obj]$ size */*/mm/page_alloc.o text data bss dec hex filename 37409 9904 8524 55837 da1d gcc-4.9.4/base/mm/page_alloc.o 38273 9904 8524 56701 dd7d gcc-4.9.4/head/mm/page_alloc.o 37465 9840 8428 55733 d9b5 gcc-5.5.0/base/mm/page_alloc.o 38169 9840 8428 56437 dc75 gcc-5.5.0/head/mm/page_alloc.o 37573 9840 8428 55841 da21 gcc-6.4.0/base/mm/page_alloc.o 38261 9840 8428 56529 dcd1 gcc-6.4.0/head/mm/page_alloc.o 36863 9840 8428 55131 d75b gcc-7.2.0/base/mm/page_alloc.o 37711 9840 8428 55979 daab gcc-7.2.0/head/mm/page_alloc.o Text size increased about 800 bytes for mm/page_alloc.o. [aaron@aaronlu obj]$ size */*/vmlinux text data bss dec hex filename 10342757 5903208 17723392 33969357 20654cd gcc-4.9.4/base/vmlinux 10342757 5903208 17723392 33969357 20654cd gcc-4.9.4/head/vmlinux 10332448 5836608 17715200 33884256 2050860 gcc-5.5.0/base/vmlinux 10332448 5836608 17715200 33884256 2050860 gcc-5.5.0/head/vmlinux 10094546 5836696 17715200 33646442 201676a gcc-6.4.0/base/vmlinux 10094546 5836696 17715200 33646442 201676a gcc-6.4.0/head/vmlinux 10018775 5828732 17715200 33562707 2002053 gcc-7.2.0/base/vmlinux 10018775 5828732 17715200 33562707 2002053 gcc-7.2.0/head/vmlinux Text size for vmlinux has no change though, probably due to function alignment. Link: http://lkml.kernel.org/r/20171013063111.GA26032@intel.com Signed-off-by: Aaron Lu Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Andi Kleen Cc: Huang Ying Cc: Tim Chen Cc: Kemi Wang Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 39e847cd1484..ab648e359602 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1813,7 +1813,7 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags * Go through the free lists for the given migratetype and remove * the smallest available page from the freelists */ -static inline +static __always_inline struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, int migratetype) { @@ -1857,7 +1857,7 @@ static int fallbacks[MIGRATE_TYPES][4] = { }; #ifdef CONFIG_CMA -static struct page *__rmqueue_cma_fallback(struct zone *zone, +static __always_inline struct page *__rmqueue_cma_fallback(struct zone *zone, unsigned int order) { return __rmqueue_smallest(zone, order, MIGRATE_CMA); @@ -2238,7 +2238,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, * deviation from the rest of this file, to make the for loop * condition simpler. */ -static inline bool +static __always_inline bool __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) { struct free_area *area; @@ -2310,8 +2310,8 @@ do_steal: * Do the hard work of removing an element from the buddy allocator. * Call me with the zone->lock already held. */ -static struct page *__rmqueue(struct zone *zone, unsigned int order, - int migratetype) +static __always_inline struct page * +__rmqueue(struct zone *zone, unsigned int order, int migratetype) { struct page *page; From 00bb31fa44acf9591018810ff9d5ab375779c1cc Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 15 Nov 2017 17:36:56 -0800 Subject: [PATCH 092/131] userfaultfd: use mmgrab instead of open-coded increment of mm_count Link: http://lkml.kernel.org/r/1508132478-7738-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Cc: Andrea Arcangeli Cc: "Dr . David Alan Gilbert" Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index f46d133c0949..ac9a4e65ca49 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -668,7 +668,7 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) ctx->features = octx->features; ctx->released = false; ctx->mm = vma->vm_mm; - atomic_inc(&ctx->mm->mm_count); + mmgrab(ctx->mm); userfaultfd_ctx_get(octx); fctx->orig = octx; From b6b18aa87b5d61fcca8b5b35372d705e915eb374 Mon Sep 17 00:00:00 2001 From: Laszlo Toth Date: Wed, 15 Nov 2017 17:37:00 -0800 Subject: [PATCH 093/131] mm, soft_offline: improve hugepage soft offlining error log On a failed attempt, we get the following entry: soft offline: 0x3c0000: migration failed 1, type 17ffffc0008008 (uptodate|head) Make this more specific to be straightforward and to follow other error log formats in soft_offline_huge_page(). Link: http://lkml.kernel.org/r/20171016171757.GA3018@ubuntu-desk-vm Signed-off-by: Laszlo Toth Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 88366626c0b7..4acdf393a801 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1587,7 +1587,7 @@ static int soft_offline_huge_page(struct page *page, int flags) ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { - pr_info("soft offline: %#lx: migration failed %d, type %lx (%pGp)\n", + pr_info("soft offline: %#lx: hugepage migration failed %d, type %lx (%pGp)\n", pfn, ret, page->flags, &page->flags); if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); From 9823e51bfd47e1d556b47b0061baeb2f05497bef Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 15 Nov 2017 17:37:04 -0800 Subject: [PATCH 094/131] mm/page-writeback.c: convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. Link: http://lkml.kernel.org/r/20171016225913.GA99214@beast Signed-off-by: Kees Cook Reviewed-by: Jan Kara Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Matthew Wilcox Cc: Jeff Layton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 231651a1486d..83c746577aea 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -628,9 +628,9 @@ EXPORT_SYMBOL_GPL(wb_writeout_inc); * On idle system, we can be called long after we scheduled because we use * deferred timers so count with missed periods. */ -static void writeout_period(unsigned long t) +static void writeout_period(struct timer_list *t) { - struct wb_domain *dom = (void *)t; + struct wb_domain *dom = from_timer(dom, t, period_timer); int miss_periods = (jiffies - dom->period_time) / VM_COMPLETIONS_PERIOD_LEN; @@ -653,8 +653,7 @@ int wb_domain_init(struct wb_domain *dom, gfp_t gfp) spin_lock_init(&dom->lock); - setup_deferrable_timer(&dom->period_timer, writeout_period, - (unsigned long)dom); + timer_setup(&dom->period_timer, writeout_period, TIMER_DEFERRABLE); dom->dirty_limit_tstamp = jiffies; From 384bc41fc064bd8b12b7081aa3e81d26f3407045 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 15 Nov 2017 17:37:08 -0800 Subject: [PATCH 095/131] drivers/block/zram/zram_drv.c: make zram_page_end_io() static zram_page_end_io() is local to the source and does not need to be in global scope, so make it static. Cleans up sparse warning: symbol 'zram_page_end_io' was not declared. Should it be static? Link: http://lkml.kernel.org/r/20171016173336.20320-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 88564222f473..d70eba30003a 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -428,7 +428,7 @@ static void put_entry_bdev(struct zram *zram, unsigned long entry) WARN_ON_ONCE(!was_set); } -void zram_page_end_io(struct bio *bio) +static void zram_page_end_io(struct bio *bio) { struct page *page = bio->bi_io_vec[0].bv_page; From 736304f3245f39392895ff3392e1325d3e49e7d2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:37:11 -0800 Subject: [PATCH 096/131] mm: speed up cancel_dirty_page() for clean pages Patch series "Speed up page cache truncation", v1. When rebasing our enterprise distro to a newer kernel (from 4.4 to 4.12) we have noticed a regression in bonnie++ benchmark when deleting files. Eventually we have tracked this down to a fact that page cache truncation got slower by about 10%. There were both gains and losses in the above interval of kernels but we have been able to identify that commit 83929372f629 ("filemap: prepare find and delete operations for huge pages") caused about 10% regression on its own. After some investigation it didn't seem easily possible to fix the regression while maintaining the THP in page cache functionality so we've decided to optimize the page cache truncation path instead to make up for the change. This series is a result of that effort. Patch 1 is an easy speedup of cancel_dirty_page(). Patches 2-6 refactor page cache truncation code so that it is easier to batch radix tree operations. Patch 7 implements batching of deletes from the radix tree which more than makes up for the original regression. This patch (of 7): cancel_dirty_page() does quite some work even for clean pages (fetching of mapping, locking of memcg, atomic bit op on page flags) so it accounts for ~2.5% of cost of truncation of a clean page. That is not much but still dumb for something we don't need at all. Check whether a page is actually dirty and avoid any work if not. Link: http://lkml.kernel.org/r/20171010151937.26984-2-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Mel Gorman Reviewed-by: Andi Kleen Cc: Dave Hansen Cc: Dave Chinner Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 8 +++++++- mm/page-writeback.c | 4 ++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 703599fa8288..c7b1d617dff6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1440,7 +1440,13 @@ void account_page_cleaned(struct page *page, struct address_space *mapping, struct bdi_writeback *wb); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); -void cancel_dirty_page(struct page *page); +void __cancel_dirty_page(struct page *page); +static inline void cancel_dirty_page(struct page *page) +{ + /* Avoid atomic ops, locking, etc. when not actually needed. */ + if (PageDirty(page)) + __cancel_dirty_page(page); +} int clear_page_dirty_for_io(struct page *page); int get_cmdline(struct task_struct *task, char *buffer, int buflen); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 83c746577aea..436714917e03 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2608,7 +2608,7 @@ EXPORT_SYMBOL(set_page_dirty_lock); * page without actually doing it through the VM. Can you say "ext3 is * horribly ugly"? Thought you could. */ -void cancel_dirty_page(struct page *page) +void __cancel_dirty_page(struct page *page) { struct address_space *mapping = page_mapping(page); @@ -2629,7 +2629,7 @@ void cancel_dirty_page(struct page *page) ClearPageDirty(page); } } -EXPORT_SYMBOL(cancel_dirty_page); +EXPORT_SYMBOL(__cancel_dirty_page); /* * Clear a page's dirty flag, while caring for dirty memory accounting. From 9f4e41f4717832e34cca153ced62b4a1d7e26c0e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:37:15 -0800 Subject: [PATCH 097/131] mm: refactor truncate_complete_page() Move call of delete_from_page_cache() and page->mapping check out of truncate_complete_page() into the single caller - truncate_inode_page(). Also move page_mapped() check into truncate_complete_page(). That way it will be easier to batch operations. Also rename truncate_complete_page() to truncate_cleanup_page(). Link: http://lkml.kernel.org/r/20171010151937.26984-3-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Mel Gorman Reviewed-by: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index 2330223841fb..383a530d511e 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -134,11 +134,17 @@ void do_invalidatepage(struct page *page, unsigned int offset, * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ -static int -truncate_complete_page(struct address_space *mapping, struct page *page) +static void +truncate_cleanup_page(struct address_space *mapping, struct page *page) { - if (page->mapping != mapping) - return -EIO; + if (page_mapped(page)) { + loff_t holelen; + + holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE; + unmap_mapping_range(mapping, + (loff_t)page->index << PAGE_SHIFT, + holelen, 0); + } if (page_has_private(page)) do_invalidatepage(page, 0, PAGE_SIZE); @@ -150,8 +156,6 @@ truncate_complete_page(struct address_space *mapping, struct page *page) */ cancel_dirty_page(page); ClearPageMappedToDisk(page); - delete_from_page_cache(page); - return 0; } /* @@ -180,16 +184,14 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) int truncate_inode_page(struct address_space *mapping, struct page *page) { - loff_t holelen; VM_BUG_ON_PAGE(PageTail(page), page); - holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE; - if (page_mapped(page)) { - unmap_mapping_range(mapping, - (loff_t)page->index << PAGE_SHIFT, - holelen, 0); - } - return truncate_complete_page(mapping, page); + if (page->mapping != mapping) + return -EIO; + + truncate_cleanup_page(mapping, page); + delete_from_page_cache(page); + return 0; } /* From 59c66c5f8c4fb823240d70553c1686ce4e4dd331 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:37:18 -0800 Subject: [PATCH 098/131] mm: factor out page cache page freeing into a separate function Factor out page freeing from delete_from_page_cache() into a separate function. We will need to call the same when batching pagecache deletion operations. invalidate_complete_page2() and replace_page_cache_page() might want to call this function as well however they currently don't seem to handle THPs so it's unnecessary for them to take the hit of checking whether a page is THP or not. Link: http://lkml.kernel.org/r/20171010151937.26984-4-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Mel Gorman Reviewed-by: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 6eb4e32d99c8..ecf7565ff435 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -254,6 +254,23 @@ void __delete_from_page_cache(struct page *page, void *shadow) account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); } +static void page_cache_free_page(struct address_space *mapping, + struct page *page) +{ + void (*freepage)(struct page *); + + freepage = mapping->a_ops->freepage; + if (freepage) + freepage(page); + + if (PageTransHuge(page) && !PageHuge(page)) { + page_ref_sub(page, HPAGE_PMD_NR); + VM_BUG_ON_PAGE(page_count(page) <= 0, page); + } else { + put_page(page); + } +} + /** * delete_from_page_cache - delete page from page cache * @page: the page which the kernel is trying to remove from page cache @@ -266,25 +283,13 @@ void delete_from_page_cache(struct page *page) { struct address_space *mapping = page_mapping(page); unsigned long flags; - void (*freepage)(struct page *); BUG_ON(!PageLocked(page)); - - freepage = mapping->a_ops->freepage; - spin_lock_irqsave(&mapping->tree_lock, flags); __delete_from_page_cache(page, NULL); spin_unlock_irqrestore(&mapping->tree_lock, flags); - if (freepage) - freepage(page); - - if (PageTransHuge(page) && !PageHuge(page)) { - page_ref_sub(page, HPAGE_PMD_NR); - VM_BUG_ON_PAGE(page_count(page) <= 0, page); - } else { - put_page(page); - } + page_cache_free_page(mapping, page); } EXPORT_SYMBOL(delete_from_page_cache); From 76253fbc8fbf6018401755fc5c07814a837cc832 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:37:22 -0800 Subject: [PATCH 099/131] mm: move accounting updates before page_cache_tree_delete() Move updates of various counters before page_cache_tree_delete() call. It will be easier to batch things this way and there is no difference whether the counters get updated before or after removal from the radix tree. Link: http://lkml.kernel.org/r/20171010151937.26984-5-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Mel Gorman Reviewed-by: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index ecf7565ff435..014109e66e4a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -224,34 +224,35 @@ void __delete_from_page_cache(struct page *page, void *shadow) } } + /* hugetlb pages do not participate in page cache accounting. */ + if (!PageHuge(page)) { + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + if (PageSwapBacked(page)) { + __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); + if (PageTransHuge(page)) + __dec_node_page_state(page, NR_SHMEM_THPS); + } else { + VM_BUG_ON_PAGE(PageTransHuge(page), page); + } + + /* + * At this point page must be either written or cleaned by + * truncate. Dirty page here signals a bug and loss of + * unwritten data. + * + * This fixes dirty accounting after removing the page entirely + * but leaves PageDirty set: it has no effect for truncated + * page and anyway will be cleared before returning page into + * buddy allocator. + */ + if (WARN_ON_ONCE(PageDirty(page))) + account_page_cleaned(page, mapping, + inode_to_wb(mapping->host)); + } page_cache_tree_delete(mapping, page, shadow); page->mapping = NULL; /* Leave page->index set: truncation lookup relies upon it */ - - /* hugetlb pages do not participate in page cache accounting. */ - if (PageHuge(page)) - return; - - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); - if (PageSwapBacked(page)) { - __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); - if (PageTransHuge(page)) - __dec_node_page_state(page, NR_SHMEM_THPS); - } else { - VM_BUG_ON_PAGE(PageTransHuge(page), page); - } - - /* - * At this point page must be either written or cleaned by truncate. - * Dirty page here signals a bug and loss of unwritten data. - * - * This fixes dirty accounting after removing the page entirely but - * leaves PageDirty set: it has no effect for truncated page and - * anyway will be cleared before returning page into buddy allocator. - */ - if (WARN_ON_ONCE(PageDirty(page))) - account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); } static void page_cache_free_page(struct address_space *mapping, From 2300638b124645c26d082dbb57841878202ff6f7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:37:26 -0800 Subject: [PATCH 100/131] mm: move clearing of page->mapping to page_cache_tree_delete() Clearing of page->mapping makes sense in page_cache_tree_delete() as well and it will help us with batching things this way. Link: http://lkml.kernel.org/r/20171010151937.26984-6-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Mel Gorman Reviewed-by: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 014109e66e4a..c649624d386c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -165,6 +165,9 @@ static void page_cache_tree_delete(struct address_space *mapping, workingset_update_node, mapping); } + page->mapping = NULL; + /* Leave page->index set: truncation lookup relies upon it */ + if (shadow) { mapping->nrexceptional += nr; /* @@ -250,9 +253,6 @@ void __delete_from_page_cache(struct page *page, void *shadow) inode_to_wb(mapping->host)); } page_cache_tree_delete(mapping, page, shadow); - - page->mapping = NULL; - /* Leave page->index set: truncation lookup relies upon it */ } static void page_cache_free_page(struct address_space *mapping, From 5ecc4d852c03b82646bf563460091b95f6a8c7c0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:37:29 -0800 Subject: [PATCH 101/131] mm: factor out checks and accounting from __delete_from_page_cache() Move checks and accounting updates from __delete_from_page_cache() into a separate function. We will reuse it when batching page cache truncation operations. Link: http://lkml.kernel.org/r/20171010151937.26984-7-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Mel Gorman Reviewed-by: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 72 ++++++++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index c649624d386c..a11b42189436 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -181,17 +181,11 @@ static void page_cache_tree_delete(struct address_space *mapping, mapping->nrpages -= nr; } -/* - * Delete a page from the page cache and free it. Caller has to make - * sure the page is locked and that nobody else uses it - or that usage - * is safe. The caller must hold the mapping's tree_lock. - */ -void __delete_from_page_cache(struct page *page, void *shadow) +static void unaccount_page_cache_page(struct address_space *mapping, + struct page *page) { - struct address_space *mapping = page->mapping; - int nr = hpage_nr_pages(page); + int nr; - trace_mm_filemap_delete_from_page_cache(page); /* * if we're uptodate, flush out into the cleancache, otherwise * invalidate any existing cleancache entries. We can't leave @@ -228,30 +222,46 @@ void __delete_from_page_cache(struct page *page, void *shadow) } /* hugetlb pages do not participate in page cache accounting. */ - if (!PageHuge(page)) { - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); - if (PageSwapBacked(page)) { - __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); - if (PageTransHuge(page)) - __dec_node_page_state(page, NR_SHMEM_THPS); - } else { - VM_BUG_ON_PAGE(PageTransHuge(page), page); - } + if (PageHuge(page)) + return; - /* - * At this point page must be either written or cleaned by - * truncate. Dirty page here signals a bug and loss of - * unwritten data. - * - * This fixes dirty accounting after removing the page entirely - * but leaves PageDirty set: it has no effect for truncated - * page and anyway will be cleared before returning page into - * buddy allocator. - */ - if (WARN_ON_ONCE(PageDirty(page))) - account_page_cleaned(page, mapping, - inode_to_wb(mapping->host)); + nr = hpage_nr_pages(page); + + __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr); + if (PageSwapBacked(page)) { + __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr); + if (PageTransHuge(page)) + __dec_node_page_state(page, NR_SHMEM_THPS); + } else { + VM_BUG_ON_PAGE(PageTransHuge(page), page); } + + /* + * At this point page must be either written or cleaned by + * truncate. Dirty page here signals a bug and loss of + * unwritten data. + * + * This fixes dirty accounting after removing the page entirely + * but leaves PageDirty set: it has no effect for truncated + * page and anyway will be cleared before returning page into + * buddy allocator. + */ + if (WARN_ON_ONCE(PageDirty(page))) + account_page_cleaned(page, mapping, inode_to_wb(mapping->host)); +} + +/* + * Delete a page from the page cache and free it. Caller has to make + * sure the page is locked and that nobody else uses it - or that usage + * is safe. The caller must hold the mapping's tree_lock. + */ +void __delete_from_page_cache(struct page *page, void *shadow) +{ + struct address_space *mapping = page->mapping; + + trace_mm_filemap_delete_from_page_cache(page); + + unaccount_page_cache_page(mapping, page); page_cache_tree_delete(mapping, page, shadow); } From aa65c29ce1b6e1990cd2c7d8004bbea7ff3aff38 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 15 Nov 2017 17:37:33 -0800 Subject: [PATCH 102/131] mm: batch radix tree operations when truncating pages Currently we remove pages from the radix tree one by one. To speed up page cache truncation, lock several pages at once and free them in one go. This allows us to batch radix tree operations in a more efficient way and also save round-trips on mapping->tree_lock. As a result we gain about 20% speed improvement in page cache truncation. Data from a simple benchmark timing 10000 truncates of 1024 pages (on ext4 on ramdisk but the filesystem is barely visible in the profiles). The range shows 1% and 95% percentiles of the measured times: 4.14-rc2 4.14-rc2 + batched truncation 248-256 209-219 249-258 209-217 248-255 211-239 248-255 209-217 247-256 210-218 [jack@suse.cz: convert delete_from_page_cache_batch() to pagevec] Link: http://lkml.kernel.org/r/20171018111648.13714-1-jack@suse.cz [akpm@linux-foundation.org: move struct pagevec forward declaration to top-of-file] Link: http://lkml.kernel.org/r/20171010151937.26984-8-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Mel Gorman Reviewed-by: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 4 ++ mm/filemap.c | 83 +++++++++++++++++++++++++++++++++++++++++ mm/truncate.c | 20 +++++++++- 3 files changed, 105 insertions(+), 2 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 956d6a80e126..e0f7181118fe 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -16,6 +16,8 @@ #include /* for in_interrupt() */ #include +struct pagevec; + /* * Bits in mapping->flags. */ @@ -624,6 +626,8 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, extern void delete_from_page_cache(struct page *page); extern void __delete_from_page_cache(struct page *page, void *shadow); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); +void delete_from_page_cache_batch(struct address_space *mapping, + struct pagevec *pvec); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: diff --git a/mm/filemap.c b/mm/filemap.c index a11b42189436..a470dd8cd05b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -304,6 +304,89 @@ void delete_from_page_cache(struct page *page) } EXPORT_SYMBOL(delete_from_page_cache); +/* + * page_cache_tree_delete_batch - delete several pages from page cache + * @mapping: the mapping to which pages belong + * @pvec: pagevec with pages to delete + * + * The function walks over mapping->page_tree and removes pages passed in @pvec + * from the radix tree. The function expects @pvec to be sorted by page index. + * It tolerates holes in @pvec (radix tree entries at those indices are not + * modified). The function expects only THP head pages to be present in the + * @pvec and takes care to delete all corresponding tail pages from the radix + * tree as well. + * + * The function expects mapping->tree_lock to be held. + */ +static void +page_cache_tree_delete_batch(struct address_space *mapping, + struct pagevec *pvec) +{ + struct radix_tree_iter iter; + void **slot; + int total_pages = 0; + int i = 0, tail_pages = 0; + struct page *page; + pgoff_t start; + + start = pvec->pages[0]->index; + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { + if (i >= pagevec_count(pvec) && !tail_pages) + break; + page = radix_tree_deref_slot_protected(slot, + &mapping->tree_lock); + if (radix_tree_exceptional_entry(page)) + continue; + if (!tail_pages) { + /* + * Some page got inserted in our range? Skip it. We + * have our pages locked so they are protected from + * being removed. + */ + if (page != pvec->pages[i]) + continue; + WARN_ON_ONCE(!PageLocked(page)); + if (PageTransHuge(page) && !PageHuge(page)) + tail_pages = HPAGE_PMD_NR - 1; + page->mapping = NULL; + /* + * Leave page->index set: truncation lookup relies + * upon it + */ + i++; + } else { + tail_pages--; + } + radix_tree_clear_tags(&mapping->page_tree, iter.node, slot); + __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL, + workingset_update_node, mapping); + total_pages++; + } + mapping->nrpages -= total_pages; +} + +void delete_from_page_cache_batch(struct address_space *mapping, + struct pagevec *pvec) +{ + int i; + unsigned long flags; + + if (!pagevec_count(pvec)) + return; + + spin_lock_irqsave(&mapping->tree_lock, flags); + for (i = 0; i < pagevec_count(pvec); i++) { + trace_mm_filemap_delete_from_page_cache(pvec->pages[i]); + + unaccount_page_cache_page(mapping, pvec->pages[i]); + } + page_cache_tree_delete_batch(mapping, pvec); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + + for (i = 0; i < pagevec_count(pvec); i++) + page_cache_free_page(mapping, pvec->pages[i]); +} + int filemap_check_errors(struct address_space *mapping) { int ret = 0; diff --git a/mm/truncate.c b/mm/truncate.c index 383a530d511e..4a39a3150ee2 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -294,6 +294,14 @@ void truncate_inode_pages_range(struct address_space *mapping, while (index < end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), indices)) { + /* + * Pagevec array has exceptional entries and we may also fail + * to lock some pages. So we store pages that can be deleted + * in a new pagevec. + */ + struct pagevec locked_pvec; + + pagevec_init(&locked_pvec, 0); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -315,9 +323,17 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); continue; } - truncate_inode_page(mapping, page); - unlock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + continue; + } + pagevec_add(&locked_pvec, page); } + for (i = 0; i < pagevec_count(&locked_pvec); i++) + truncate_cleanup_page(mapping, locked_pvec.pages[i]); + delete_from_page_cache_batch(mapping, &locked_pvec); + for (i = 0; i < pagevec_count(&locked_pvec); i++) + unlock_page(locked_pvec.pages[i]); pagevec_remove_exceptionals(&pvec); pagevec_release(&pvec); cond_resched(); From 9cca35d42eb61b69e108a17215756c46173a5e6f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:37:37 -0800 Subject: [PATCH 103/131] mm, page_alloc: enable/disable IRQs once when freeing a list of pages Patch series "Follow-up for speed up page cache truncation", v2. This series is a follow-on for Jan Kara's series "Speed up page cache truncation" series. We both ended up looking at the same problem but saw different problems based on the same data. This series builds upon his work. A variety of workloads were compared on four separate machines but each machine showed gains albeit at different levels. Minimally, some of the differences are due to NUMA where truncating data from a remote node is slower than a local node. The workloads checked were o sparse truncate microbenchmark, tiny o sparse truncate microbenchmark, large o reaim-io disk workfile o dbench4 (modified by mmtests to produce more stable results) o filebench varmail configuration for small memory size o bonnie, directory operations, working set size 2*RAM reaim-io, dbench and filebench all showed minor gains. Truncation does not dominate those workloads but were tested to ensure no other regressions. They will not be reported further. The sparse truncate microbench was written by Jan. It creates a number of files and then times how long it takes to truncate each one. The "tiny" configuraiton creates a number of files that easily fits in memory and times how long it takes to truncate files with page cache. The large configuration uses enough files to have data that is twice the size of memory and so timings there include truncating page cache and working set shadow entries in the radix tree. Patches 1-4 are the most relevant parts of this series. Patches 5-8 are optional as they are deleting code that is essentially useless but has a negligible performance impact. The changelogs have more information on performance but just for bonnie delete options, the main comparison is bonnie 4.14.0-rc5 4.14.0-rc5 4.14.0-rc5 jan-v2 vanilla mel-v2 Hmean SeqCreate ops 76.20 ( 0.00%) 75.80 ( -0.53%) 76.80 ( 0.79%) Hmean SeqCreate read 85.00 ( 0.00%) 85.00 ( 0.00%) 85.00 ( 0.00%) Hmean SeqCreate del 13752.31 ( 0.00%) 12090.23 ( -12.09%) 15304.84 ( 11.29%) Hmean RandCreate ops 76.00 ( 0.00%) 75.60 ( -0.53%) 77.00 ( 1.32%) Hmean RandCreate read 96.80 ( 0.00%) 96.80 ( 0.00%) 97.00 ( 0.21%) Hmean RandCreate del 13233.75 ( 0.00%) 11525.35 ( -12.91%) 14446.61 ( 9.16%) Jan's series is the baseline and the vanilla kernel is 12% slower where as this series on top gains another 11%. This is from a different machine than the data in the changelogs but the detailed data was not collected as there was no substantial change in v2. This patch (of 8): Freeing a list of pages current enables/disables IRQs for each page freed. This patch splits freeing a list of pages into two operations -- preparing the pages for freeing and the actual freeing. This is a tradeoff - we're taking two passes of the list to free in exchange for avoiding multiple enable/disable of IRQs. sparsetruncate (tiny) 4.14.0-rc4 4.14.0-rc4 janbatch-v1r1 oneirq-v1r1 Min Time 149.00 ( 0.00%) 141.00 ( 5.37%) 1st-qrtle Time 150.00 ( 0.00%) 142.00 ( 5.33%) 2nd-qrtle Time 151.00 ( 0.00%) 142.00 ( 5.96%) 3rd-qrtle Time 151.00 ( 0.00%) 143.00 ( 5.30%) Max-90% Time 153.00 ( 0.00%) 144.00 ( 5.88%) Max-95% Time 155.00 ( 0.00%) 147.00 ( 5.16%) Max-99% Time 201.00 ( 0.00%) 195.00 ( 2.99%) Max Time 236.00 ( 0.00%) 230.00 ( 2.54%) Amean Time 152.65 ( 0.00%) 144.37 ( 5.43%) Stddev Time 9.78 ( 0.00%) 10.44 ( -6.72%) Coeff Time 6.41 ( 0.00%) 7.23 ( -12.84%) Best99%Amean Time 152.07 ( 0.00%) 143.72 ( 5.50%) Best95%Amean Time 150.75 ( 0.00%) 142.37 ( 5.56%) Best90%Amean Time 150.59 ( 0.00%) 142.19 ( 5.58%) Best75%Amean Time 150.36 ( 0.00%) 141.92 ( 5.61%) Best50%Amean Time 150.04 ( 0.00%) 141.69 ( 5.56%) Best25%Amean Time 149.85 ( 0.00%) 141.38 ( 5.65%) With a tiny number of files, each file truncated has resident page cache and it shows that time to truncate is roughtly 5-6% with some minor jitter. 4.14.0-rc4 4.14.0-rc4 janbatch-v1r1 oneirq-v1r1 Hmean SeqCreate ops 65.27 ( 0.00%) 81.86 ( 25.43%) Hmean SeqCreate read 39.48 ( 0.00%) 47.44 ( 20.16%) Hmean SeqCreate del 24963.95 ( 0.00%) 26319.99 ( 5.43%) Hmean RandCreate ops 65.47 ( 0.00%) 82.01 ( 25.26%) Hmean RandCreate read 42.04 ( 0.00%) 51.75 ( 23.09%) Hmean RandCreate del 23377.66 ( 0.00%) 23764.79 ( 1.66%) As expected, there is a small gain for the delete operation. [mgorman@techsingularity.net: use page_private and set_page_private helpers] Link: http://lkml.kernel.org/r/20171018101547.mjycw7zreb66jzpa@techsingularity.net Link: http://lkml.kernel.org/r/20171018075952.10627-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Cc: Jan Kara Cc: Andi Kleen Cc: Dave Hansen Cc: Dave Chinner Cc: Vlastimil Babka Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 60 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 15 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ab648e359602..6a3c4a1d513f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2611,24 +2611,26 @@ void mark_free_pages(struct zone *zone) } #endif /* CONFIG_PM */ -/* - * Free a 0-order page - * cold == true ? free a cold page : free a hot page - */ -void free_hot_cold_page(struct page *page, bool cold) +static bool free_hot_cold_page_prepare(struct page *page, unsigned long pfn) { - struct zone *zone = page_zone(page); - struct per_cpu_pages *pcp; - unsigned long flags; - unsigned long pfn = page_to_pfn(page); int migratetype; if (!free_pcp_prepare(page)) - return; + return false; migratetype = get_pfnblock_migratetype(page, pfn); set_pcppage_migratetype(page, migratetype); - local_irq_save(flags); + return true; +} + +static void free_hot_cold_page_commit(struct page *page, unsigned long pfn, + bool cold) +{ + struct zone *zone = page_zone(page); + struct per_cpu_pages *pcp; + int migratetype; + + migratetype = get_pcppage_migratetype(page); __count_vm_event(PGFREE); /* @@ -2641,7 +2643,7 @@ void free_hot_cold_page(struct page *page, bool cold) if (migratetype >= MIGRATE_PCPTYPES) { if (unlikely(is_migrate_isolate(migratetype))) { free_one_page(zone, page, pfn, 0, migratetype); - goto out; + return; } migratetype = MIGRATE_MOVABLE; } @@ -2657,8 +2659,22 @@ void free_hot_cold_page(struct page *page, bool cold) free_pcppages_bulk(zone, batch, pcp); pcp->count -= batch; } +} -out: +/* + * Free a 0-order page + * cold == true ? free a cold page : free a hot page + */ +void free_hot_cold_page(struct page *page, bool cold) +{ + unsigned long flags; + unsigned long pfn = page_to_pfn(page); + + if (!free_hot_cold_page_prepare(page, pfn)) + return; + + local_irq_save(flags); + free_hot_cold_page_commit(page, pfn, cold); local_irq_restore(flags); } @@ -2668,11 +2684,25 @@ out: void free_hot_cold_page_list(struct list_head *list, bool cold) { struct page *page, *next; + unsigned long flags, pfn; + /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { - trace_mm_page_free_batched(page, cold); - free_hot_cold_page(page, cold); + pfn = page_to_pfn(page); + if (!free_hot_cold_page_prepare(page, pfn)) + list_del(&page->lru); + set_page_private(page, pfn); } + + local_irq_save(flags); + list_for_each_entry_safe(page, next, list, lru) { + unsigned long pfn = page_private(page); + + set_page_private(page, 0); + trace_mm_page_free_batched(page, cold); + free_hot_cold_page_commit(page, pfn, cold); + } + local_irq_restore(flags); } /* From c7df8ad2910e965a6241b6d8f52fd122e26b0315 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:37:41 -0800 Subject: [PATCH 104/131] mm, truncate: do not check mapping for every page being truncated During truncation, the mapping has already been checked for shmem and dax so it's known that workingset_update_node is required. This patch avoids the checks on mapping for each page being truncated. In all other cases, a lookup helper is used to determine if workingset_update_node() needs to be called. The one danger is that the API is slightly harder to use as calling workingset_update_node directly without checking for dax or shmem mappings could lead to surprises. However, the API rarely needs to be used and hopefully the comment is enough to give people the hint. sparsetruncate (tiny) 4.14.0-rc4 4.14.0-rc4 oneirq-v1r1 pickhelper-v1r1 Min Time 141.00 ( 0.00%) 140.00 ( 0.71%) 1st-qrtle Time 142.00 ( 0.00%) 141.00 ( 0.70%) 2nd-qrtle Time 142.00 ( 0.00%) 142.00 ( 0.00%) 3rd-qrtle Time 143.00 ( 0.00%) 143.00 ( 0.00%) Max-90% Time 144.00 ( 0.00%) 144.00 ( 0.00%) Max-95% Time 147.00 ( 0.00%) 145.00 ( 1.36%) Max-99% Time 195.00 ( 0.00%) 191.00 ( 2.05%) Max Time 230.00 ( 0.00%) 205.00 ( 10.87%) Amean Time 144.37 ( 0.00%) 143.82 ( 0.38%) Stddev Time 10.44 ( 0.00%) 9.00 ( 13.74%) Coeff Time 7.23 ( 0.00%) 6.26 ( 13.41%) Best99%Amean Time 143.72 ( 0.00%) 143.34 ( 0.26%) Best95%Amean Time 142.37 ( 0.00%) 142.00 ( 0.26%) Best90%Amean Time 142.19 ( 0.00%) 141.85 ( 0.24%) Best75%Amean Time 141.92 ( 0.00%) 141.58 ( 0.24%) Best50%Amean Time 141.69 ( 0.00%) 141.31 ( 0.27%) Best25%Amean Time 141.38 ( 0.00%) 140.97 ( 0.29%) As you'd expect, the gain is marginal but it can be detected. The differences in bonnie are all within the noise which is not surprising given the impact on the microbenchmark. radix_tree_update_node_t is a callback for some radix operations that optionally passes in a private field. The only user of the callback is workingset_update_node and as it no longer requires a mapping, the private field is removed. Link: http://lkml.kernel.org/r/20171018075952.10627-3-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Jan Kara Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 2 +- include/linux/radix-tree.h | 7 +++---- include/linux/swap.h | 13 +++++++++++- lib/idr.c | 2 +- lib/radix-tree.c | 30 ++++++++++++--------------- mm/filemap.c | 7 ++++--- mm/shmem.c | 2 +- mm/truncate.c | 2 +- mm/workingset.c | 10 ++------- tools/testing/radix-tree/multiorder.c | 2 +- 10 files changed, 39 insertions(+), 38 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 9ec797424e4f..165fdfb6e508 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -565,7 +565,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, ret = __radix_tree_lookup(page_tree, index, &node, &slot); WARN_ON_ONCE(ret != entry); __radix_tree_replace(page_tree, node, slot, - new_entry, NULL, NULL); + new_entry, NULL); entry = new_entry; } diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 567ebb5eaab0..0ca448c1cb42 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -301,18 +301,17 @@ void *__radix_tree_lookup(const struct radix_tree_root *, unsigned long index, void *radix_tree_lookup(const struct radix_tree_root *, unsigned long); void __rcu **radix_tree_lookup_slot(const struct radix_tree_root *, unsigned long index); -typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *); +typedef void (*radix_tree_update_node_t)(struct radix_tree_node *); void __radix_tree_replace(struct radix_tree_root *, struct radix_tree_node *, void __rcu **slot, void *entry, - radix_tree_update_node_t update_node, void *private); + radix_tree_update_node_t update_node); void radix_tree_iter_replace(struct radix_tree_root *, const struct radix_tree_iter *, void __rcu **slot, void *entry); void radix_tree_replace_slot(struct radix_tree_root *, void __rcu **slot, void *entry); void __radix_tree_delete_node(struct radix_tree_root *, struct radix_tree_node *, - radix_tree_update_node_t update_node, - void *private); + radix_tree_update_node_t update_node); void radix_tree_iter_delete(struct radix_tree_root *, struct radix_tree_iter *iter, void __rcu **slot); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); diff --git a/include/linux/swap.h b/include/linux/swap.h index 8b8a6f965785..454f042bcdd5 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -298,7 +298,18 @@ struct vma_swap_readahead { void *workingset_eviction(struct address_space *mapping, struct page *page); bool workingset_refault(void *shadow); void workingset_activation(struct page *page); -void workingset_update_node(struct radix_tree_node *node, void *private); + +/* Do not use directly, use workingset_lookup_update */ +void workingset_update_node(struct radix_tree_node *node); + +/* Returns workingset_update_node() if the mapping has shadow entries. */ +#define workingset_lookup_update(mapping) \ +({ \ + radix_tree_update_node_t __helper = workingset_update_node; \ + if (dax_mapping(mapping) || shmem_mapping(mapping)) \ + __helper = NULL; \ + __helper; \ +}) /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; diff --git a/lib/idr.c b/lib/idr.c index edd9b2be1651..2593ce513a18 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -171,7 +171,7 @@ void *idr_replace_ext(struct idr *idr, void *ptr, unsigned long id) if (!slot || radix_tree_tag_get(&idr->idr_rt, id, IDR_FREE)) return ERR_PTR(-ENOENT); - __radix_tree_replace(&idr->idr_rt, node, slot, ptr, NULL, NULL); + __radix_tree_replace(&idr->idr_rt, node, slot, ptr, NULL); return entry; } diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8b1feca1230a..c8d55565fafa 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -677,8 +677,7 @@ out: * @root radix tree root */ static inline bool radix_tree_shrink(struct radix_tree_root *root, - radix_tree_update_node_t update_node, - void *private) + radix_tree_update_node_t update_node) { bool shrunk = false; @@ -739,7 +738,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, if (!radix_tree_is_internal_node(child)) { node->slots[0] = (void __rcu *)RADIX_TREE_RETRY; if (update_node) - update_node(node, private); + update_node(node); } WARN_ON_ONCE(!list_empty(&node->private_list)); @@ -752,7 +751,7 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, static bool delete_node(struct radix_tree_root *root, struct radix_tree_node *node, - radix_tree_update_node_t update_node, void *private) + radix_tree_update_node_t update_node) { bool deleted = false; @@ -762,8 +761,8 @@ static bool delete_node(struct radix_tree_root *root, if (node->count) { if (node_to_entry(node) == rcu_dereference_raw(root->rnode)) - deleted |= radix_tree_shrink(root, update_node, - private); + deleted |= radix_tree_shrink(root, + update_node); return deleted; } @@ -1173,7 +1172,6 @@ static int calculate_count(struct radix_tree_root *root, * @slot: pointer to slot in @node * @item: new item to store in the slot. * @update_node: callback for changing leaf nodes - * @private: private data to pass to @update_node * * For use with __radix_tree_lookup(). Caller must hold tree write locked * across slot lookup and replacement. @@ -1181,7 +1179,7 @@ static int calculate_count(struct radix_tree_root *root, void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, void __rcu **slot, void *item, - radix_tree_update_node_t update_node, void *private) + radix_tree_update_node_t update_node) { void *old = rcu_dereference_raw(*slot); int exceptional = !!radix_tree_exceptional_entry(item) - @@ -1201,9 +1199,9 @@ void __radix_tree_replace(struct radix_tree_root *root, return; if (update_node) - update_node(node, private); + update_node(node); - delete_node(root, node, update_node, private); + delete_node(root, node, update_node); } /** @@ -1225,7 +1223,7 @@ void __radix_tree_replace(struct radix_tree_root *root, void radix_tree_replace_slot(struct radix_tree_root *root, void __rcu **slot, void *item) { - __radix_tree_replace(root, NULL, slot, item, NULL, NULL); + __radix_tree_replace(root, NULL, slot, item, NULL); } EXPORT_SYMBOL(radix_tree_replace_slot); @@ -1242,7 +1240,7 @@ void radix_tree_iter_replace(struct radix_tree_root *root, const struct radix_tree_iter *iter, void __rcu **slot, void *item) { - __radix_tree_replace(root, iter->node, slot, item, NULL, NULL); + __radix_tree_replace(root, iter->node, slot, item, NULL); } #ifdef CONFIG_RADIX_TREE_MULTIORDER @@ -1972,7 +1970,6 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); * @root: radix tree root * @node: node containing @index * @update_node: callback for changing leaf nodes - * @private: private data to pass to @update_node * * After clearing the slot at @index in @node from radix tree * rooted at @root, call this function to attempt freeing the @@ -1980,10 +1977,9 @@ EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); */ void __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node, - radix_tree_update_node_t update_node, - void *private) + radix_tree_update_node_t update_node) { - delete_node(root, node, update_node, private); + delete_node(root, node, update_node); } static bool __radix_tree_delete(struct radix_tree_root *root, @@ -2001,7 +1997,7 @@ static bool __radix_tree_delete(struct radix_tree_root *root, node_tag_clear(root, node, tag, offset); replace_slot(slot, NULL, node, -1, exceptional); - return node && delete_node(root, node, NULL, NULL); + return node && delete_node(root, node, NULL); } /** diff --git a/mm/filemap.c b/mm/filemap.c index a470dd8cd05b..155370fc87f2 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -35,6 +35,7 @@ #include #include #include +#include #include #include "internal.h" @@ -134,7 +135,7 @@ static int page_cache_tree_insert(struct address_space *mapping, *shadowp = p; } __radix_tree_replace(&mapping->page_tree, node, slot, page, - workingset_update_node, mapping); + workingset_lookup_update(mapping)); mapping->nrpages++; return 0; } @@ -162,7 +163,7 @@ static void page_cache_tree_delete(struct address_space *mapping, radix_tree_clear_tags(&mapping->page_tree, node, slot); __radix_tree_replace(&mapping->page_tree, node, slot, shadow, - workingset_update_node, mapping); + workingset_lookup_update(mapping)); } page->mapping = NULL; @@ -359,7 +360,7 @@ page_cache_tree_delete_batch(struct address_space *mapping, } radix_tree_clear_tags(&mapping->page_tree, iter.node, slot); __radix_tree_replace(&mapping->page_tree, iter.node, slot, NULL, - workingset_update_node, mapping); + workingset_lookup_update(mapping)); total_pages++; } mapping->nrpages -= total_pages; diff --git a/mm/shmem.c b/mm/shmem.c index 07a1d22807be..a72f68aee6a4 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -338,7 +338,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping, if (item != expected) return -ENOENT; __radix_tree_replace(&mapping->page_tree, node, pslot, - replacement, NULL, NULL); + replacement, NULL); return 0; } diff --git a/mm/truncate.c b/mm/truncate.c index 4a39a3150ee2..02a0c0466c78 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -42,7 +42,7 @@ static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, if (*slot != entry) goto unlock; __radix_tree_replace(&mapping->page_tree, node, slot, NULL, - workingset_update_node, mapping); + workingset_update_node); mapping->nrexceptional--; unlock: spin_unlock_irq(&mapping->tree_lock); diff --git a/mm/workingset.c b/mm/workingset.c index b997c9de28f6..b7d616a3bbbe 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -340,14 +340,8 @@ out: static struct list_lru shadow_nodes; -void workingset_update_node(struct radix_tree_node *node, void *private) +void workingset_update_node(struct radix_tree_node *node) { - struct address_space *mapping = private; - - /* Only regular page cache has shadow entries */ - if (dax_mapping(mapping) || shmem_mapping(mapping)) - return; - /* * Track non-empty nodes that contain only shadow entries; * unlink those that contain pages or are being freed. @@ -475,7 +469,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, goto out_invalid; inc_lruvec_page_state(virt_to_page(node), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->page_tree, node, - workingset_update_node, mapping); + workingset_lookup_update(mapping)); out_invalid: spin_unlock(&mapping->tree_lock); diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 06c71178d07d..59245b3d587c 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -618,7 +618,7 @@ static void multiorder_account(void) __radix_tree_insert(&tree, 1 << 5, 5, (void *)0x12); __radix_tree_lookup(&tree, 1 << 5, &node, &slot); assert(node->count == node->exceptional * 2); - __radix_tree_replace(&tree, node, slot, NULL, NULL, NULL); + __radix_tree_replace(&tree, node, slot, NULL, NULL); assert(node->exceptional == 0); item_kill_tree(&tree); From f2187599189d94aeeee2fa5d9806186c7732ed37 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:37:44 -0800 Subject: [PATCH 105/131] mm, truncate: remove all exceptional entries from pagevec under one lock During truncate each entry in a pagevec is checked to see if it is an exceptional entry and if so, the shadow entry is cleaned up. This is potentially expensive as multiple entries for a mapping locks/unlocks the tree lock. This batches the operation such that any exceptional entries removed from a pagevec only acquire the mapping tree lock once. The corner case where this is more expensive is where there is only one exceptional entry but this is unlikely due to temporal locality and how it affects LRU ordering. Note that for truncations of small files created recently, this patch should show no gain because it only batches the handling of exceptional entries. sparsetruncate (large) 4.14.0-rc4 4.14.0-rc4 pickhelper-v1r1 batchshadow-v1r1 Min Time 38.00 ( 0.00%) 27.00 ( 28.95%) 1st-qrtle Time 40.00 ( 0.00%) 28.00 ( 30.00%) 2nd-qrtle Time 44.00 ( 0.00%) 41.00 ( 6.82%) 3rd-qrtle Time 146.00 ( 0.00%) 147.00 ( -0.68%) Max-90% Time 153.00 ( 0.00%) 153.00 ( 0.00%) Max-95% Time 155.00 ( 0.00%) 156.00 ( -0.65%) Max-99% Time 181.00 ( 0.00%) 171.00 ( 5.52%) Amean Time 93.04 ( 0.00%) 88.43 ( 4.96%) Best99%Amean Time 92.08 ( 0.00%) 86.13 ( 6.46%) Best95%Amean Time 89.19 ( 0.00%) 83.13 ( 6.80%) Best90%Amean Time 85.60 ( 0.00%) 79.15 ( 7.53%) Best75%Amean Time 72.95 ( 0.00%) 65.09 ( 10.78%) Best50%Amean Time 39.86 ( 0.00%) 28.20 ( 29.25%) Best25%Amean Time 39.44 ( 0.00%) 27.70 ( 29.77%) bonnie 4.14.0-rc4 4.14.0-rc4 pickhelper-v1r1 batchshadow-v1r1 Hmean SeqCreate ops 71.92 ( 0.00%) 76.78 ( 6.76%) Hmean SeqCreate read 42.42 ( 0.00%) 45.01 ( 6.10%) Hmean SeqCreate del 26519.88 ( 0.00%) 27191.87 ( 2.53%) Hmean RandCreate ops 71.92 ( 0.00%) 76.95 ( 7.00%) Hmean RandCreate read 44.44 ( 0.00%) 49.23 ( 10.78%) Hmean RandCreate del 24948.62 ( 0.00%) 24764.97 ( -0.74%) Truncation of a large number of files shows a substantial gain with 99% of files being truncated 6.46% faster. bonnie shows a modest gain of 2.53% [jack@suse.cz: fix truncate_exceptional_pvec_entries()] Link: http://lkml.kernel.org/r/20171108164226.26788-1-jack@suse.cz Link: http://lkml.kernel.org/r/20171018075952.10627-4-mgorman@techsingularity.net Signed-off-by: Mel Gorman Signed-off-by: Jan Kara Reviewed-by: Jan Kara Acked-by: Johannes Weiner Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 91 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 28 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index 02a0c0466c78..c30e8fa3d063 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -25,44 +25,85 @@ #include #include "internal.h" -static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, - void *entry) +/* + * Regular page slots are stabilized by the page lock even without the tree + * itself locked. These unlocked entries need verification under the tree + * lock. + */ +static inline void __clear_shadow_entry(struct address_space *mapping, + pgoff_t index, void *entry) { struct radix_tree_node *node; void **slot; - spin_lock_irq(&mapping->tree_lock); - /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. - */ if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) - goto unlock; + return; if (*slot != entry) - goto unlock; + return; __radix_tree_replace(&mapping->page_tree, node, slot, NULL, workingset_update_node); mapping->nrexceptional--; -unlock: +} + +static void clear_shadow_entry(struct address_space *mapping, pgoff_t index, + void *entry) +{ + spin_lock_irq(&mapping->tree_lock); + __clear_shadow_entry(mapping, index, entry); spin_unlock_irq(&mapping->tree_lock); } /* - * Unconditionally remove exceptional entry. Usually called from truncate path. + * Unconditionally remove exceptional entries. Usually called from truncate + * path. Note that the pagevec may be altered by this function by removing + * exceptional entries similar to what pagevec_remove_exceptionals does. */ -static void truncate_exceptional_entry(struct address_space *mapping, - pgoff_t index, void *entry) +static void truncate_exceptional_pvec_entries(struct address_space *mapping, + struct pagevec *pvec, pgoff_t *indices, + pgoff_t end) { + int i, j; + bool dax, lock; + /* Handled by shmem itself */ if (shmem_mapping(mapping)) return; - if (dax_mapping(mapping)) { - dax_delete_mapping_entry(mapping, index); + for (j = 0; j < pagevec_count(pvec); j++) + if (radix_tree_exceptional_entry(pvec->pages[j])) + break; + + if (j == pagevec_count(pvec)) return; + + dax = dax_mapping(mapping); + lock = !dax && indices[j] < end; + if (lock) + spin_lock_irq(&mapping->tree_lock); + + for (i = j; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + pgoff_t index = indices[i]; + + if (!radix_tree_exceptional_entry(page)) { + pvec->pages[j++] = page; + continue; + } + + if (index >= end) + continue; + + if (unlikely(dax)) { + dax_delete_mapping_entry(mapping, index); + continue; + } + + __clear_shadow_entry(mapping, index, page); } - clear_shadow_entry(mapping, index, entry); + + if (lock) + spin_unlock_irq(&mapping->tree_lock); + pvec->nr = j; } /* @@ -310,11 +351,8 @@ void truncate_inode_pages_range(struct address_space *mapping, if (index >= end) break; - if (radix_tree_exceptional_entry(page)) { - truncate_exceptional_entry(mapping, index, - page); + if (radix_tree_exceptional_entry(page)) continue; - } if (!trylock_page(page)) continue; @@ -334,12 +372,11 @@ void truncate_inode_pages_range(struct address_space *mapping, delete_from_page_cache_batch(mapping, &locked_pvec); for (i = 0; i < pagevec_count(&locked_pvec); i++) unlock_page(locked_pvec.pages[i]); - pagevec_remove_exceptionals(&pvec); + truncate_exceptional_pvec_entries(mapping, &pvec, indices, end); pagevec_release(&pvec); cond_resched(); index++; } - if (partial_start) { struct page *page = find_lock_page(mapping, start - 1); if (page) { @@ -397,6 +434,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_release(&pvec); break; } + for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -408,11 +446,8 @@ void truncate_inode_pages_range(struct address_space *mapping, break; } - if (radix_tree_exceptional_entry(page)) { - truncate_exceptional_entry(mapping, index, - page); + if (radix_tree_exceptional_entry(page)) continue; - } lock_page(page); WARN_ON(page_to_index(page) != index); @@ -420,7 +455,7 @@ void truncate_inode_pages_range(struct address_space *mapping, truncate_inode_page(mapping, page); unlock_page(page); } - pagevec_remove_exceptionals(&pvec); + truncate_exceptional_pvec_entries(mapping, &pvec, indices, end); pagevec_release(&pvec); index++; } From d9ed0d08b6c6a882da1d8e75bb3162fc889fd199 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:37:48 -0800 Subject: [PATCH 106/131] mm: only drain per-cpu pagevecs once per pagevec usage When a pagevec is initialised on the stack, it is generally used multiple times over a range of pages, looking up entries and then releasing them. On each pagevec_release, the per-cpu deferred LRU pagevecs are drained on the grounds the page being released may be on those queues and the pages may be cache hot. In many cases only the first drain is necessary as it's unlikely that the range of pages being walked is racing against LRU addition. Even if there is such a race, the impact is marginal where as constantly redraining the lru pagevecs costs. This patch ensures that pagevec is only drained once in a given lifecycle without increasing the cache footprint of the pagevec structure. Only sparsetruncate tiny is shown here as large files have many exceptional entries and calls pagecache_release less frequently. sparsetruncate (tiny) 4.14.0-rc4 4.14.0-rc4 batchshadow-v1r1 onedrain-v1r1 Min Time 141.00 ( 0.00%) 141.00 ( 0.00%) 1st-qrtle Time 142.00 ( 0.00%) 142.00 ( 0.00%) 2nd-qrtle Time 142.00 ( 0.00%) 142.00 ( 0.00%) 3rd-qrtle Time 143.00 ( 0.00%) 143.00 ( 0.00%) Max-90% Time 144.00 ( 0.00%) 144.00 ( 0.00%) Max-95% Time 146.00 ( 0.00%) 145.00 ( 0.68%) Max-99% Time 198.00 ( 0.00%) 194.00 ( 2.02%) Max Time 254.00 ( 0.00%) 208.00 ( 18.11%) Amean Time 145.12 ( 0.00%) 144.30 ( 0.56%) Stddev Time 12.74 ( 0.00%) 9.62 ( 24.49%) Coeff Time 8.78 ( 0.00%) 6.67 ( 24.06%) Best99%Amean Time 144.29 ( 0.00%) 143.82 ( 0.32%) Best95%Amean Time 142.68 ( 0.00%) 142.31 ( 0.26%) Best90%Amean Time 142.52 ( 0.00%) 142.19 ( 0.24%) Best75%Amean Time 142.26 ( 0.00%) 141.98 ( 0.20%) Best50%Amean Time 141.90 ( 0.00%) 141.71 ( 0.13%) Best25%Amean Time 141.80 ( 0.00%) 141.43 ( 0.26%) The impact on bonnie is marginal and within the noise because a significant percentage of the file being truncated has been reclaimed and consists of shadow entries which reduce the hotness of the pagevec_release path. Link: http://lkml.kernel.org/r/20171018075952.10627-5-mgorman@techsingularity.net Signed-off-by: Mel Gorman Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 4 +++- mm/swap.c | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 4f56e0ad9d00..95c75c858d1f 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -17,7 +17,8 @@ struct address_space; struct pagevec { unsigned long nr; - unsigned long cold; + bool cold; + bool drained; struct page *pages[PAGEVEC_SIZE]; }; @@ -54,6 +55,7 @@ static inline void pagevec_init(struct pagevec *pvec, int cold) { pvec->nr = 0; pvec->cold = cold; + pvec->drained = false; } static inline void pagevec_reinit(struct pagevec *pvec) diff --git a/mm/swap.c b/mm/swap.c index 4edac536fe24..3e564a95ee73 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -833,7 +833,10 @@ EXPORT_SYMBOL(release_pages); */ void __pagevec_release(struct pagevec *pvec) { - lru_add_drain(); + if (!pvec->drained) { + lru_add_drain(); + pvec->drained = true; + } release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); pagevec_reinit(pvec); } From 8667982014d6048e0b5e286b6247ff24f48d4cc6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:37:52 -0800 Subject: [PATCH 107/131] mm, pagevec: remove cold parameter for pagevecs Every pagevec_init user claims the pages being released are hot even in cases where it is unlikely the pages are hot. As no one cares about the hotness of pages being released to the allocator, just ditch the parameter. No performance impact is expected as the overhead is marginal. The parameter is removed simply because it is a bit stupid to have a useless parameter copied everywhere. Link: http://lkml.kernel.org/r/20171018075952.10627-6-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/i915/i915_gem_gtt.c | 2 +- fs/afs/write.c | 4 ++-- fs/btrfs/extent_io.c | 4 ++-- fs/buffer.c | 4 ++-- fs/cachefiles/rdwr.c | 4 ++-- fs/ceph/addr.c | 4 ++-- fs/dax.c | 2 +- fs/ext4/inode.c | 6 +++--- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 2 +- fs/f2fs/node.c | 8 ++++---- fs/fscache/page.c | 2 +- fs/gfs2/aops.c | 2 +- fs/hugetlbfs/inode.c | 2 +- fs/nilfs2/btree.c | 2 +- fs/nilfs2/page.c | 8 ++++---- fs/nilfs2/segment.c | 4 ++-- include/linux/pagevec.h | 4 +--- mm/filemap.c | 2 +- mm/mlock.c | 4 ++-- mm/page-writeback.c | 2 +- mm/shmem.c | 6 +++--- mm/swap.c | 4 ++-- mm/truncate.c | 8 ++++---- 24 files changed, 45 insertions(+), 47 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_gem_gtt.c b/drivers/gpu/drm/i915/i915_gem_gtt.c index ad524cb0f6fc..7982ad817c11 100644 --- a/drivers/gpu/drm/i915/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/i915_gem_gtt.c @@ -1859,7 +1859,7 @@ static void i915_address_space_init(struct i915_address_space *vm, INIT_LIST_HEAD(&vm->unbound_list); list_add_tail(&vm->global_link, &dev_priv->vm_list); - pagevec_init(&vm->free_pages, false); + pagevec_init(&vm->free_pages); } static void i915_address_space_fini(struct i915_address_space *vm) diff --git a/fs/afs/write.c b/fs/afs/write.c index d62a6b54152d..11dd0526b96b 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -308,7 +308,7 @@ static void afs_kill_pages(struct afs_vnode *vnode, bool error, _enter("{%x:%u},%lx-%lx", vnode->fid.vid, vnode->fid.vnode, first, last); - pagevec_init(&pv, 0); + pagevec_init(&pv); do { _debug("kill %lx-%lx", first, last); @@ -602,7 +602,7 @@ void afs_pages_written_back(struct afs_vnode *vnode, struct afs_call *call) ASSERT(wb != NULL); - pagevec_init(&pv, 0); + pagevec_init(&pv); do { _debug("done %lx-%lx", first, last); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 93fdaac81212..16045ea86fc1 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3797,7 +3797,7 @@ int btree_write_cache_pages(struct address_space *mapping, int scanned = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; @@ -3936,7 +3936,7 @@ static int extent_write_cache_pages(struct address_space *mapping, if (!igrab(inode)) return 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { index = mapping->writeback_index; /* Start from prev offset */ end = -1; diff --git a/fs/buffer.c b/fs/buffer.c index 1c18a22a6013..0736a6a2e2f0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1592,7 +1592,7 @@ void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) struct buffer_head *head; end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (pagevec_lookup_range(&pvec, bd_mapping, &index, end)) { count = pagevec_count(&pvec); for (i = 0; i < count; i++) { @@ -3514,7 +3514,7 @@ page_cache_seek_hole_data(struct inode *inode, loff_t offset, loff_t length, if (length <= 0) return -ENOENT; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); do { unsigned nr_pages, i; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 18d7aa61ef0f..23097cca2674 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -710,7 +710,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, /* calculate the shift required to use bmap */ shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; - pagevec_init(&pagevec, 0); + pagevec_init(&pagevec); op->op.flags &= FSCACHE_OP_KEEP_FLAGS; op->op.flags |= FSCACHE_OP_ASYNC; @@ -844,7 +844,7 @@ int cachefiles_allocate_pages(struct fscache_retrieval *op, ret = cachefiles_has_space(cache, 0, *nr_pages); if (ret == 0) { - pagevec_init(&pagevec, 0); + pagevec_init(&pagevec); list_for_each_entry(page, pages, lru) { if (pagevec_add(&pagevec, page) == 0) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 26c682db94ee..dbf07051aacd 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -680,7 +680,7 @@ static void ceph_release_pages(struct page **pages, int num) struct pagevec pvec; int i; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); for (i = 0; i < num; i++) { if (pagevec_add(&pvec, pages[i]) == 0) pagevec_release(&pvec); @@ -811,7 +811,7 @@ static int ceph_writepages_start(struct address_space *mapping, if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); start_index = wbc->range_cyclic ? mapping->writeback_index : 0; index = start_index; diff --git a/fs/dax.c b/fs/dax.c index 165fdfb6e508..3652b26a0048 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -794,7 +794,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, tag_pages_for_writeback(mapping, start_index, end_index); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (!done) { pvec.nr = find_get_entries_tag(mapping, start_index, PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b239a41dbeeb..8d2b582fb141 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1719,7 +1719,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, ext4_es_remove_extent(inode, start, last - start + 1); } - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (index <= end) { nr_pages = pagevec_lookup_range(&pvec, mapping, &index, end); if (nr_pages == 0) @@ -2345,7 +2345,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) lblk = start << bpp_bits; pblock = mpd->map.m_pblk; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (start <= end) { nr_pages = pagevec_lookup_range(&pvec, inode->i_mapping, &start, end); @@ -2616,7 +2616,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) else tag = PAGECACHE_TAG_DIRTY; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); mpd->map.m_len = 0; mpd->next_page = index; while (index <= end) { diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6124f8710dc3..0bb8e2c022d3 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -314,7 +314,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, }; struct blk_plug plug; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); blk_start_plug(&plug); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 687703755824..7b3ad5d8e2e9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1635,7 +1635,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int range_whole = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (get_dirty_pages(mapping->host) <= SM_I(F2FS_M_SB(mapping))->min_hot_blocks) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d6e4df0bb622..b33dac9592ca 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1282,7 +1282,7 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) struct page *last_page = NULL; int nr_pages; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = 0; while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, @@ -1436,7 +1436,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, return PTR_ERR_OR_ZERO(last_page); } retry: - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = 0; while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, @@ -1547,7 +1547,7 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, int ret = 0; int nr_pages; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); next_step: index = 0; @@ -1648,7 +1648,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) int ret2, ret = 0; int nr_pages; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while ((nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, PAGECACHE_TAG_WRITEBACK))) { diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 0ad3fd3ad0b4..961029e04027 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -1175,7 +1175,7 @@ void __fscache_uncache_all_inode_pages(struct fscache_cookie *cookie, return; } - pagevec_init(&pvec, 0); + pagevec_init(&pvec); next = 0; do { if (!pagevec_lookup(&pvec, mapping, &next)) diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 3fea3d7780b0..1daf15a1f00c 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -371,7 +371,7 @@ static int gfs2_write_cache_jdata(struct address_space *mapping, int range_whole = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6adb17bb1a38..1e76730aac0d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -407,7 +407,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); next = start; while (next < end) { /* diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 35989c7bb065..16a7a67a11c9 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2156,7 +2156,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, level++) INIT_LIST_HEAD(&lists[level]); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY)) { diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 1c16726915c1..68241512d7c1 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -255,7 +255,7 @@ int nilfs_copy_dirty_pages(struct address_space *dmap, pgoff_t index = 0; int err = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY)) return 0; @@ -309,7 +309,7 @@ void nilfs_copy_back_pages(struct address_space *dmap, pgoff_t index = 0; int err; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: n = pagevec_lookup(&pvec, smap, &index); if (!n) @@ -373,7 +373,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping, bool silent) unsigned int i; pgoff_t index = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY)) { @@ -518,7 +518,7 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, index = start_blk >> (PAGE_SHIFT - inode->i_blkbits); nblocks_in_page = 1U << (PAGE_SHIFT - inode->i_blkbits); - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: pvec.nr = find_get_pages_contig(inode->i_mapping, index, PAGEVEC_SIZE, diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 19366ab20bea..f65392fecb5c 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -708,7 +708,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, index = start >> PAGE_SHIFT; last = end >> PAGE_SHIFT; } - pagevec_init(&pvec, 0); + pagevec_init(&pvec); repeat: if (unlikely(index > last) || !pagevec_lookup_range_tag(&pvec, mapping, &index, last, @@ -753,7 +753,7 @@ static void nilfs_lookup_dirty_node_buffers(struct inode *inode, unsigned int i; pgoff_t index = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY)) { diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 95c75c858d1f..eebefd209424 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -17,7 +17,6 @@ struct address_space; struct pagevec { unsigned long nr; - bool cold; bool drained; struct page *pages[PAGEVEC_SIZE]; }; @@ -51,10 +50,9 @@ static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, return pagevec_lookup_range_tag(pvec, mapping, index, (pgoff_t)-1, tag); } -static inline void pagevec_init(struct pagevec *pvec, int cold) +static inline void pagevec_init(struct pagevec *pvec) { pvec->nr = 0; - pvec->cold = cold; pvec->drained = false; } diff --git a/mm/filemap.c b/mm/filemap.c index 155370fc87f2..90a9f261f85f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -519,7 +519,7 @@ static void __filemap_fdatawait_range(struct address_space *mapping, if (end_byte < start_byte) return; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (index <= end) { unsigned i; diff --git a/mm/mlock.c b/mm/mlock.c index 46af369c13e5..ed37cb208d19 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -289,7 +289,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone) struct pagevec pvec_putback; int pgrescued = 0; - pagevec_init(&pvec_putback, 0); + pagevec_init(&pvec_putback); /* Phase 1: page isolation */ spin_lock_irq(zone_lru_lock(zone)); @@ -448,7 +448,7 @@ void munlock_vma_pages_range(struct vm_area_struct *vma, struct pagevec pvec; struct zone *zone; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); /* * Although FOLL_DUMP is intended for get_dump_page(), * it just so happens that its special treatment of the diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 436714917e03..05313f402ba8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2168,7 +2168,7 @@ int write_cache_pages(struct address_space *mapping, int range_whole = 0; int tag; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; diff --git a/mm/shmem.c b/mm/shmem.c index a72f68aee6a4..7ea8b276ba8b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -747,7 +747,7 @@ void shmem_unlock_mapping(struct address_space *mapping) pgoff_t indices[PAGEVEC_SIZE]; pgoff_t index = 0; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); /* * Minor point, but we might as well stop if someone else SHM_LOCKs it. */ @@ -790,7 +790,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend, if (lend == -1) end = -1; /* unsigned, so actually very big */ - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = start; while (index < end) { pvec.nr = find_get_entries(mapping, index, @@ -2528,7 +2528,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping, bool done = false; int i; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); pvec.nr = 1; /* start small: we may be there already */ while (!done) { pvec.nr = find_get_entries(mapping, index, diff --git a/mm/swap.c b/mm/swap.c index 3e564a95ee73..88a19b6cdf7c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -210,7 +210,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, } if (pgdat) spin_unlock_irqrestore(&pgdat->lru_lock, flags); - release_pages(pvec->pages, pvec->nr, pvec->cold); + release_pages(pvec->pages, pvec->nr, 0); pagevec_reinit(pvec); } @@ -837,7 +837,7 @@ void __pagevec_release(struct pagevec *pvec) lru_add_drain(); pvec->drained = true; } - release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); + release_pages(pvec->pages, pagevec_count(pvec), 0); pagevec_reinit(pvec); } EXPORT_SYMBOL(__pagevec_release); diff --git a/mm/truncate.c b/mm/truncate.c index c30e8fa3d063..e4b4cf0f4070 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -330,7 +330,7 @@ void truncate_inode_pages_range(struct address_space *mapping, else end = (lend + 1) >> PAGE_SHIFT; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = start; while (index < end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE), @@ -342,7 +342,7 @@ void truncate_inode_pages_range(struct address_space *mapping, */ struct pagevec locked_pvec; - pagevec_init(&locked_pvec, 0); + pagevec_init(&locked_pvec); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -553,7 +553,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, unsigned long count = 0; int i; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, indices)) { @@ -683,7 +683,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, if (mapping->nrpages == 0 && mapping->nrexceptional == 0) goto out; - pagevec_init(&pvec, 0); + pagevec_init(&pvec); index = start; while (index <= end && pagevec_lookup_entries(&pvec, mapping, index, min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1, From c6f92f9fbe7dbcc8903a67229aa88b4077ae4422 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:37:55 -0800 Subject: [PATCH 108/131] mm: remove cold parameter for release_pages All callers of release_pages claim the pages being released are cache hot. As no one cares about the hotness of pages being released to the allocator, just ditch the parameter. No performance impact is expected as the overhead is marginal. The parameter is removed simply because it is a bit stupid to have a useless parameter copied everywhere. Link: http://lkml.kernel.org/r/20171018075952.10627-7-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c | 6 ++---- drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c | 2 +- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 6 +++--- drivers/gpu/drm/i915/i915_gem_userptr.c | 4 ++-- drivers/gpu/drm/radeon/radeon_ttm.c | 2 +- fs/fuse/dev.c | 2 +- include/linux/pagemap.h | 2 +- include/linux/swap.h | 2 +- mm/swap.c | 8 ++++---- mm/swap_state.c | 2 +- 11 files changed, 18 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c index 60d8bedb694d..cd664832f9e8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_cs.c @@ -553,8 +553,7 @@ static int amdgpu_cs_parser_bos(struct amdgpu_cs_parser *p, * invalidated it. Free it and try again */ release_pages(e->user_pages, - e->robj->tbo.ttm->num_pages, - false); + e->robj->tbo.ttm->num_pages); kvfree(e->user_pages); e->user_pages = NULL; } @@ -691,8 +690,7 @@ error_free_pages: continue; release_pages(e->user_pages, - e->robj->tbo.ttm->num_pages, - false); + e->robj->tbo.ttm->num_pages); kvfree(e->user_pages); } } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c index 6149a47fe63d..0bda8f2a188a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gem.c @@ -347,7 +347,7 @@ int amdgpu_gem_userptr_ioctl(struct drm_device *dev, void *data, return 0; free_pages: - release_pages(bo->tbo.ttm->pages, bo->tbo.ttm->num_pages, false); + release_pages(bo->tbo.ttm->pages, bo->tbo.ttm->num_pages); unlock_mmap_sem: up_read(¤t->mm->mmap_sem); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c index bc746131987f..d792959fac43 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c @@ -659,7 +659,7 @@ int amdgpu_ttm_tt_get_user_pages(struct ttm_tt *ttm, struct page **pages) return 0; release_pages: - release_pages(pages, pinned, 0); + release_pages(pages, pinned); return r; } diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 57881167ccd2..bcc8c2d7c7c9 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -779,7 +779,7 @@ static struct page **etnaviv_gem_userptr_do_get_pages( up_read(&mm->mmap_sem); if (ret < 0) { - release_pages(pvec, pinned, 0); + release_pages(pvec, pinned); kvfree(pvec); return ERR_PTR(ret); } @@ -852,7 +852,7 @@ static int etnaviv_gem_userptr_get_pages(struct etnaviv_gem_object *etnaviv_obj) } } - release_pages(pvec, pinned, 0); + release_pages(pvec, pinned); kvfree(pvec); work = kmalloc(sizeof(*work), GFP_KERNEL); @@ -886,7 +886,7 @@ static void etnaviv_gem_userptr_release(struct etnaviv_gem_object *etnaviv_obj) if (etnaviv_obj->pages) { int npages = etnaviv_obj->base.size >> PAGE_SHIFT; - release_pages(etnaviv_obj->pages, npages, 0); + release_pages(etnaviv_obj->pages, npages); kvfree(etnaviv_obj->pages); } put_task_struct(etnaviv_obj->userptr.task); diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 709efe2357ea..aa22361bd5a1 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -554,7 +554,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) } mutex_unlock(&obj->mm.lock); - release_pages(pvec, pinned, 0); + release_pages(pvec, pinned); kvfree(pvec); i915_gem_object_put(obj); @@ -668,7 +668,7 @@ i915_gem_userptr_get_pages(struct drm_i915_gem_object *obj) __i915_gem_userptr_set_active(obj, true); if (IS_ERR(pages)) - release_pages(pvec, pinned, 0); + release_pages(pvec, pinned); kvfree(pvec); return pages; diff --git a/drivers/gpu/drm/radeon/radeon_ttm.c b/drivers/gpu/drm/radeon/radeon_ttm.c index bf69bf9086bf..1fdfc7a46072 100644 --- a/drivers/gpu/drm/radeon/radeon_ttm.c +++ b/drivers/gpu/drm/radeon/radeon_ttm.c @@ -597,7 +597,7 @@ release_sg: kfree(ttm->sg); release_pages: - release_pages(ttm->pages, pinned, 0); + release_pages(ttm->pages, pinned); return r; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index a42d89371748..17f0d05bfd4c 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1636,7 +1636,7 @@ out_finish: static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) { - release_pages(req->pages, req->num_pages, false); + release_pages(req->pages, req->num_pages); } static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index e0f7181118fe..4c6790bb7afb 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -118,7 +118,7 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) m->gfp_mask = mask; } -void release_pages(struct page **pages, int nr, bool cold); +void release_pages(struct page **pages, int nr); /* * speculatively take a reference to a page. diff --git a/include/linux/swap.h b/include/linux/swap.h index 454f042bcdd5..c2b8128799c1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -510,7 +510,7 @@ static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry) #define free_page_and_swap_cache(page) \ put_page(page) #define free_pages_and_swap_cache(pages, nr) \ - release_pages((pages), (nr), false); + release_pages((pages), (nr)); static inline void show_swap_cache_info(void) { diff --git a/mm/swap.c b/mm/swap.c index 88a19b6cdf7c..29cf75f1a860 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -210,7 +210,7 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, } if (pgdat) spin_unlock_irqrestore(&pgdat->lru_lock, flags); - release_pages(pvec->pages, pvec->nr, 0); + release_pages(pvec->pages, pvec->nr); pagevec_reinit(pvec); } @@ -740,7 +740,7 @@ void lru_add_drain_all(void) * Decrement the reference count on all the pages in @pages. If it * fell to zero, remove the page from the LRU and free it. */ -void release_pages(struct page **pages, int nr, bool cold) +void release_pages(struct page **pages, int nr) { int i; LIST_HEAD(pages_to_free); @@ -817,7 +817,7 @@ void release_pages(struct page **pages, int nr, bool cold) spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); mem_cgroup_uncharge_list(&pages_to_free); - free_hot_cold_page_list(&pages_to_free, cold); + free_hot_cold_page_list(&pages_to_free, 0); } EXPORT_SYMBOL(release_pages); @@ -837,7 +837,7 @@ void __pagevec_release(struct pagevec *pvec) lru_add_drain(); pvec->drained = true; } - release_pages(pvec->pages, pagevec_count(pvec), 0); + release_pages(pvec->pages, pagevec_count(pvec)); pagevec_reinit(pvec); } EXPORT_SYMBOL(__pagevec_release); diff --git a/mm/swap_state.c b/mm/swap_state.c index 374d446f7a0a..39ae7cfad90f 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -319,7 +319,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr) lru_add_drain(); for (i = 0; i < nr; i++) free_swap_cache(pagep[i]); - release_pages(pagep, nr, false); + release_pages(pagep, nr); } /* From 2d4894b5d2ae0fe1725ea7abd57b33bfbbe45492 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:37:59 -0800 Subject: [PATCH 109/131] mm: remove cold parameter from free_hot_cold_page* Most callers users of free_hot_cold_page claim the pages being released are cache hot. The exception is the page reclaim paths where it is likely that enough pages will be freed in the near future that the per-cpu lists are going to be recycled and the cache hotness information is lost. As no one really cares about the hotness of pages being released to the allocator, just ditch the parameter. The APIs are renamed to indicate that it's no longer about hot/cold pages. It should also be less confusing as there are subtle differences between them. __free_pages drops a reference and frees a page when the refcount reaches zero. free_hot_cold_page handled pages whose refcount was already zero which is non-obvious from the name. free_unref_page should be more obvious. No performance impact is expected as the overhead is marginal. The parameter is removed simply because it is a bit stupid to have a useless parameter copied everywhere. [mgorman@techsingularity.net: add pages to head, not tail] Link: http://lkml.kernel.org/r/20171019154321.qtpzaeftoyyw4iey@techsingularity.net Link: http://lkml.kernel.org/r/20171018075952.10627-8-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/mmu_context_book3s64.c | 2 +- arch/powerpc/mm/pgtable_64.c | 2 +- arch/sparc/mm/init_64.c | 2 +- arch/tile/mm/homecache.c | 2 +- include/linux/gfp.h | 4 ++-- include/trace/events/kmem.h | 11 ++++------ mm/page_alloc.c | 29 +++++++++++--------------- mm/rmap.c | 2 +- mm/swap.c | 4 ++-- mm/vmscan.c | 6 +++--- 10 files changed, 28 insertions(+), 36 deletions(-) diff --git a/arch/powerpc/mm/mmu_context_book3s64.c b/arch/powerpc/mm/mmu_context_book3s64.c index 05e15386d4cb..a7e998158f37 100644 --- a/arch/powerpc/mm/mmu_context_book3s64.c +++ b/arch/powerpc/mm/mmu_context_book3s64.c @@ -200,7 +200,7 @@ static void destroy_pagetable_page(struct mm_struct *mm) /* We allow PTE_FRAG_NR fragments from a PTE page */ if (page_ref_sub_and_test(page, PTE_FRAG_NR - count)) { pgtable_page_dtor(page); - free_hot_cold_page(page, 0); + free_unref_page(page); } } diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index ac0717a90ca6..1ec3aee43624 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -404,7 +404,7 @@ void pte_fragment_free(unsigned long *table, int kernel) if (put_page_testzero(page)) { if (!kernel) pgtable_page_dtor(page); - free_hot_cold_page(page, 0); + free_unref_page(page); } } diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index 051f73401793..55ba62957e64 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -2939,7 +2939,7 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, if (!page) return NULL; if (!pgtable_page_ctor(page)) { - free_hot_cold_page(page, 0); + free_unref_page(page); return NULL; } return (pte_t *) page_address(page); diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c index b51cc28acd0a..4432f31e8479 100644 --- a/arch/tile/mm/homecache.c +++ b/arch/tile/mm/homecache.c @@ -409,7 +409,7 @@ void __homecache_free_pages(struct page *page, unsigned int order) if (put_page_testzero(page)) { homecache_change_page_home(page, order, PAGE_HOME_HASH); if (order == 0) { - free_hot_cold_page(page, false); + free_unref_page(page); } else { init_page_count(page); __free_pages(page, order); diff --git a/include/linux/gfp.h b/include/linux/gfp.h index b041f94678de..f7e62d9096fe 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -530,8 +530,8 @@ void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); -extern void free_hot_cold_page(struct page *page, bool cold); -extern void free_hot_cold_page_list(struct list_head *list, bool cold); +extern void free_unref_page(struct page *page); +extern void free_unref_page_list(struct list_head *list); struct page_frag_cache; extern void __page_frag_cache_drain(struct page *page, unsigned int count); diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index 285feeadac39..eb57e3037deb 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -172,24 +172,21 @@ TRACE_EVENT(mm_page_free, TRACE_EVENT(mm_page_free_batched, - TP_PROTO(struct page *page, int cold), + TP_PROTO(struct page *page), - TP_ARGS(page, cold), + TP_ARGS(page), TP_STRUCT__entry( __field( unsigned long, pfn ) - __field( int, cold ) ), TP_fast_assign( __entry->pfn = page_to_pfn(page); - __entry->cold = cold; ), - TP_printk("page=%p pfn=%lu order=0 cold=%d", + TP_printk("page=%p pfn=%lu order=0", pfn_to_page(__entry->pfn), - __entry->pfn, - __entry->cold) + __entry->pfn) ); TRACE_EVENT(mm_page_alloc, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6a3c4a1d513f..f265d37b3152 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2611,7 +2611,7 @@ void mark_free_pages(struct zone *zone) } #endif /* CONFIG_PM */ -static bool free_hot_cold_page_prepare(struct page *page, unsigned long pfn) +static bool free_unref_page_prepare(struct page *page, unsigned long pfn) { int migratetype; @@ -2623,8 +2623,7 @@ static bool free_hot_cold_page_prepare(struct page *page, unsigned long pfn) return true; } -static void free_hot_cold_page_commit(struct page *page, unsigned long pfn, - bool cold) +static void free_unref_page_commit(struct page *page, unsigned long pfn) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; @@ -2649,10 +2648,7 @@ static void free_hot_cold_page_commit(struct page *page, unsigned long pfn, } pcp = &this_cpu_ptr(zone->pageset)->pcp; - if (!cold) - list_add(&page->lru, &pcp->lists[migratetype]); - else - list_add_tail(&page->lru, &pcp->lists[migratetype]); + list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { unsigned long batch = READ_ONCE(pcp->batch); @@ -2663,25 +2659,24 @@ static void free_hot_cold_page_commit(struct page *page, unsigned long pfn, /* * Free a 0-order page - * cold == true ? free a cold page : free a hot page */ -void free_hot_cold_page(struct page *page, bool cold) +void free_unref_page(struct page *page) { unsigned long flags; unsigned long pfn = page_to_pfn(page); - if (!free_hot_cold_page_prepare(page, pfn)) + if (!free_unref_page_prepare(page, pfn)) return; local_irq_save(flags); - free_hot_cold_page_commit(page, pfn, cold); + free_unref_page_commit(page, pfn); local_irq_restore(flags); } /* * Free a list of 0-order pages */ -void free_hot_cold_page_list(struct list_head *list, bool cold) +void free_unref_page_list(struct list_head *list) { struct page *page, *next; unsigned long flags, pfn; @@ -2689,7 +2684,7 @@ void free_hot_cold_page_list(struct list_head *list, bool cold) /* Prepare pages for freeing */ list_for_each_entry_safe(page, next, list, lru) { pfn = page_to_pfn(page); - if (!free_hot_cold_page_prepare(page, pfn)) + if (!free_unref_page_prepare(page, pfn)) list_del(&page->lru); set_page_private(page, pfn); } @@ -2699,8 +2694,8 @@ void free_hot_cold_page_list(struct list_head *list, bool cold) unsigned long pfn = page_private(page); set_page_private(page, 0); - trace_mm_page_free_batched(page, cold); - free_hot_cold_page_commit(page, pfn, cold); + trace_mm_page_free_batched(page); + free_unref_page_commit(page, pfn); } local_irq_restore(flags); } @@ -4301,7 +4296,7 @@ void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { if (order == 0) - free_hot_cold_page(page, false); + free_unref_page(page); else __free_pages_ok(page, order); } @@ -4359,7 +4354,7 @@ void __page_frag_cache_drain(struct page *page, unsigned int count) unsigned int order = compound_order(page); if (order == 0) - free_hot_cold_page(page, false); + free_unref_page(page); else __free_pages_ok(page, order); } diff --git a/mm/rmap.c b/mm/rmap.c index 6b5a0f219ac0..47db27f8049e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1321,7 +1321,7 @@ void page_remove_rmap(struct page *page, bool compound) * It would be tidy to reset the PageAnon mapping here, * but that might overwrite a racing page_add_anon_rmap * which increments mapcount after us but sets mapping - * before us: so leave the reset to free_hot_cold_page, + * before us: so leave the reset to free_unref_page, * and remember that it's only reliable while mapped. * Leaving it set also helps swapoff to reinstate ptes * faster for those pages still in swapcache. diff --git a/mm/swap.c b/mm/swap.c index 29cf75f1a860..b480279c760c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -76,7 +76,7 @@ static void __page_cache_release(struct page *page) static void __put_single_page(struct page *page) { __page_cache_release(page); - free_hot_cold_page(page, false); + free_unref_page(page); } static void __put_compound_page(struct page *page) @@ -817,7 +817,7 @@ void release_pages(struct page **pages, int nr) spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags); mem_cgroup_uncharge_list(&pages_to_free); - free_hot_cold_page_list(&pages_to_free, 0); + free_unref_page_list(&pages_to_free); } EXPORT_SYMBOL(release_pages); diff --git a/mm/vmscan.c b/mm/vmscan.c index 2852b8c5a917..c02c850ea349 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1349,7 +1349,7 @@ keep: mem_cgroup_uncharge_list(&free_pages); try_to_unmap_flush(); - free_hot_cold_page_list(&free_pages, true); + free_unref_page_list(&free_pages); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); @@ -1824,7 +1824,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&page_list); - free_hot_cold_page_list(&page_list, true); + free_unref_page_list(&page_list); /* * If reclaim is isolating dirty pages under writeback, it implies @@ -2063,7 +2063,7 @@ static void shrink_active_list(unsigned long nr_to_scan, spin_unlock_irq(&pgdat->lru_lock); mem_cgroup_uncharge_list(&l_hold); - free_hot_cold_page_list(&l_hold, true); + free_unref_page_list(&l_hold); trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate, nr_deactivate, nr_rotated, sc->priority, file); } From 453f85d43fa9ee243f0fc3ac4e1be45615301e3f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:38:03 -0800 Subject: [PATCH 110/131] mm: remove __GFP_COLD As the page free path makes no distinction between cache hot and cold pages, there is no real useful ordering of pages in the free list that allocation requests can take advantage of. Juding from the users of __GFP_COLD, it is likely that a number of them are the result of copying other sites instead of actually measuring the impact. Remove the __GFP_COLD parameter which simplifies a number of paths in the page allocator. This is potentially controversial but bear in mind that the size of the per-cpu pagelists versus modern cache sizes means that the whole per-cpu list can often fit in the L3 cache. Hence, there is only a potential benefit for microbenchmarks that alloc/free pages in a tight loop. It's even worse when THP is taken into account which has little or no chance of getting a cache-hot page as the per-cpu list is bypassed and the zeroing of multiple pages will thrash the cache anyway. The truncate microbenchmarks are not shown as this patch affects the allocation path and not the free path. A page fault microbenchmark was tested but it showed no sigificant difference which is not surprising given that the __GFP_COLD branches are a miniscule percentage of the fault path. Link: http://lkml.kernel.org/r/20171018075952.10627-9-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +- drivers/net/ethernet/amd/xgbe/xgbe-desc.c | 2 +- .../net/ethernet/aquantia/atlantic/aq_ring.c | 3 +-- .../ethernet/cavium/liquidio/octeon_network.h | 2 +- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 5 ++--- .../ethernet/netronome/nfp/nfp_net_common.c | 4 ++-- drivers/net/ethernet/qlogic/qlge/qlge_main.c | 3 +-- drivers/net/ethernet/sfc/falcon/rx.c | 2 +- drivers/net/ethernet/sfc/rx.c | 2 +- .../net/ethernet/synopsys/dwc-xlgmac-desc.c | 2 +- drivers/net/ethernet/ti/netcp_core.c | 2 +- drivers/net/virtio_net.c | 1 - .../staging/lustre/lustre/mdc/mdc_request.c | 2 +- fs/cachefiles/rdwr.c | 6 ++---- include/linux/gfp.h | 5 ----- include/linux/pagemap.h | 8 +------- include/linux/skbuff.h | 2 +- include/linux/slab.h | 3 --- include/trace/events/mmflags.h | 1 - kernel/power/snapshot.c | 4 ++-- mm/filemap.c | 6 +++--- mm/page_alloc.c | 20 ++++++------------- mm/percpu-vm.c | 2 +- net/core/skbuff.c | 4 ++-- tools/perf/builtin-kmem.c | 1 - 25 files changed, 32 insertions(+), 62 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index c6bd5e24005d..fbbbd8b3eb45 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -517,7 +517,7 @@ static int ena_refill_rx_bufs(struct ena_ring *rx_ring, u32 num) rc = ena_alloc_rx_page(rx_ring, rx_info, - __GFP_COLD | GFP_ATOMIC | __GFP_COMP); + GFP_ATOMIC | __GFP_COMP); if (unlikely(rc < 0)) { netif_warn(rx_ring->adapter, rx_err, rx_ring->netdev, "failed to alloc buffer for rx queue %d\n", diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c index 45d92304068e..cc1e4f820e64 100644 --- a/drivers/net/ethernet/amd/xgbe/xgbe-desc.c +++ b/drivers/net/ethernet/amd/xgbe/xgbe-desc.c @@ -295,7 +295,7 @@ again: order = alloc_order; /* Try to obtain pages, decreasing order if necessary */ - gfp = GFP_ATOMIC | __GFP_COLD | __GFP_COMP | __GFP_NOWARN; + gfp = GFP_ATOMIC | __GFP_COMP | __GFP_NOWARN; while (order >= 0) { pages = alloc_pages_node(node, gfp, order); if (pages) diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c index 0654e0c76bc2..519ca6534b85 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_ring.c +++ b/drivers/net/ethernet/aquantia/atlantic/aq_ring.c @@ -304,8 +304,7 @@ int aq_ring_rx_fill(struct aq_ring_s *self) buff->flags = 0U; buff->len = AQ_CFG_RX_FRAME_MAX; - buff->page = alloc_pages(GFP_ATOMIC | __GFP_COLD | - __GFP_COMP, pages_order); + buff->page = alloc_pages(GFP_ATOMIC | __GFP_COMP, pages_order); if (!buff->page) { err = -ENOMEM; goto err_exit; diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_network.h b/drivers/net/ethernet/cavium/liquidio/octeon_network.h index 9e36319cead6..57853eead4b5 100644 --- a/drivers/net/ethernet/cavium/liquidio/octeon_network.h +++ b/drivers/net/ethernet/cavium/liquidio/octeon_network.h @@ -195,7 +195,7 @@ static inline void struct sk_buff *skb; struct octeon_skb_page_info *skb_pg_info; - page = alloc_page(GFP_ATOMIC | __GFP_COLD); + page = alloc_page(GFP_ATOMIC); if (unlikely(!page)) return NULL; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index b97a55c827eb..ffead38cf5da 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -193,7 +193,7 @@ static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv) if (mlx4_en_prepare_rx_desc(priv, ring, ring->actual_size, - GFP_KERNEL | __GFP_COLD)) { + GFP_KERNEL)) { if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) { en_err(priv, "Failed to allocate enough rx buffers\n"); return -ENOMEM; @@ -552,8 +552,7 @@ static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv, do { if (mlx4_en_prepare_rx_desc(priv, ring, ring->prod & ring->size_mask, - GFP_ATOMIC | __GFP_COLD | - __GFP_MEMALLOC)) + GFP_ATOMIC | __GFP_MEMALLOC)) break; ring->prod++; } while (likely(--missing)); diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index e118b5f23996..ffb12dc13a5a 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -1185,7 +1185,7 @@ static void *nfp_net_rx_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr) } else { struct page *page; - page = alloc_page(GFP_KERNEL | __GFP_COLD); + page = alloc_page(GFP_KERNEL); frag = page ? page_address(page) : NULL; } if (!frag) { @@ -1212,7 +1212,7 @@ static void *nfp_net_napi_alloc_one(struct nfp_net_dp *dp, dma_addr_t *dma_addr) } else { struct page *page; - page = alloc_page(GFP_ATOMIC | __GFP_COLD); + page = alloc_page(GFP_ATOMIC); frag = page ? page_address(page) : NULL; } if (!frag) { diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_main.c b/drivers/net/ethernet/qlogic/qlge/qlge_main.c index 29fea74bff2e..7b97a9969046 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge_main.c +++ b/drivers/net/ethernet/qlogic/qlge/qlge_main.c @@ -1092,8 +1092,7 @@ static int ql_get_next_chunk(struct ql_adapter *qdev, struct rx_ring *rx_ring, { if (!rx_ring->pg_chunk.page) { u64 map; - rx_ring->pg_chunk.page = alloc_pages(__GFP_COLD | __GFP_COMP | - GFP_ATOMIC, + rx_ring->pg_chunk.page = alloc_pages(__GFP_COMP | GFP_ATOMIC, qdev->lbq_buf_order); if (unlikely(!rx_ring->pg_chunk.page)) { netif_err(qdev, drv, qdev->ndev, diff --git a/drivers/net/ethernet/sfc/falcon/rx.c b/drivers/net/ethernet/sfc/falcon/rx.c index 6a8406dc0c2b..91097aea6c41 100644 --- a/drivers/net/ethernet/sfc/falcon/rx.c +++ b/drivers/net/ethernet/sfc/falcon/rx.c @@ -163,7 +163,7 @@ static int ef4_init_rx_buffers(struct ef4_rx_queue *rx_queue, bool atomic) do { page = ef4_reuse_page(rx_queue); if (page == NULL) { - page = alloc_pages(__GFP_COLD | __GFP_COMP | + page = alloc_pages(__GFP_COMP | (atomic ? GFP_ATOMIC : GFP_KERNEL), efx->rx_buffer_order); if (unlikely(page == NULL)) diff --git a/drivers/net/ethernet/sfc/rx.c b/drivers/net/ethernet/sfc/rx.c index 42443f434569..0004c50d3c83 100644 --- a/drivers/net/ethernet/sfc/rx.c +++ b/drivers/net/ethernet/sfc/rx.c @@ -163,7 +163,7 @@ static int efx_init_rx_buffers(struct efx_rx_queue *rx_queue, bool atomic) do { page = efx_reuse_page(rx_queue); if (page == NULL) { - page = alloc_pages(__GFP_COLD | __GFP_COMP | + page = alloc_pages(__GFP_COMP | (atomic ? GFP_ATOMIC : GFP_KERNEL), efx->rx_buffer_order); if (unlikely(page == NULL)) diff --git a/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c b/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c index e9672b1f9968..031cf9c3435a 100644 --- a/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c +++ b/drivers/net/ethernet/synopsys/dwc-xlgmac-desc.c @@ -335,7 +335,7 @@ static int xlgmac_alloc_pages(struct xlgmac_pdata *pdata, dma_addr_t pages_dma; /* Try to obtain pages, decreasing order if necessary */ - gfp |= __GFP_COLD | __GFP_COMP | __GFP_NOWARN; + gfp |= __GFP_COMP | __GFP_NOWARN; while (order >= 0) { pages = alloc_pages(gfp, order); if (pages) diff --git a/drivers/net/ethernet/ti/netcp_core.c b/drivers/net/ethernet/ti/netcp_core.c index 437d36289786..50d2b76771b5 100644 --- a/drivers/net/ethernet/ti/netcp_core.c +++ b/drivers/net/ethernet/ti/netcp_core.c @@ -906,7 +906,7 @@ static int netcp_allocate_rx_buf(struct netcp_intf *netcp, int fdq) sw_data[0] = (u32)bufptr; } else { /* Allocate a secondary receive queue entry */ - page = alloc_page(GFP_ATOMIC | GFP_DMA | __GFP_COLD); + page = alloc_page(GFP_ATOMIC | GFP_DMA); if (unlikely(!page)) { dev_warn_ratelimited(netcp->ndev_dev, "Secondary page alloc failed\n"); goto fail; diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 511f8339fa96..5eec09d63fc0 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -988,7 +988,6 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq, int err; bool oom; - gfp |= __GFP_COLD; do { if (vi->mergeable_rx_bufs) err = add_recvbuf_mergeable(vi, rq, gfp); diff --git a/drivers/staging/lustre/lustre/mdc/mdc_request.c b/drivers/staging/lustre/lustre/mdc/mdc_request.c index 9e538a59f09d..03e55bca4ada 100644 --- a/drivers/staging/lustre/lustre/mdc/mdc_request.c +++ b/drivers/staging/lustre/lustre/mdc/mdc_request.c @@ -1152,7 +1152,7 @@ static int mdc_read_page_remote(void *data, struct page *page0) } for (npages = 1; npages < max_pages; npages++) { - page = page_cache_alloc_cold(inode->i_mapping); + page = page_cache_alloc(inode->i_mapping); if (!page) break; page_pool[npages] = page; diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 23097cca2674..883bc7bb12c5 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -256,8 +256,7 @@ static int cachefiles_read_backing_file_one(struct cachefiles_object *object, goto backing_page_already_present; if (!newpage) { - newpage = __page_cache_alloc(cachefiles_gfp | - __GFP_COLD); + newpage = __page_cache_alloc(cachefiles_gfp); if (!newpage) goto nomem_monitor; } @@ -493,8 +492,7 @@ static int cachefiles_read_backing_file(struct cachefiles_object *object, goto backing_page_already_present; if (!newpage) { - newpage = __page_cache_alloc(cachefiles_gfp | - __GFP_COLD); + newpage = __page_cache_alloc(cachefiles_gfp); if (!newpage) goto nomem; } diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f7e62d9096fe..1a4582b44d32 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -24,7 +24,6 @@ struct vm_area_struct; #define ___GFP_HIGH 0x20u #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u -#define ___GFP_COLD 0x100u #define ___GFP_NOWARN 0x200u #define ___GFP_RETRY_MAYFAIL 0x400u #define ___GFP_NOFAIL 0x800u @@ -192,16 +191,12 @@ struct vm_area_struct; /* * Action modifiers * - * __GFP_COLD indicates that the caller does not expect to be used in the near - * future. Where possible, a cache-cold page will be returned. - * * __GFP_NOWARN suppresses allocation failure reports. * * __GFP_COMP address compound page metadata. * * __GFP_ZERO returns a zeroed page on success. */ -#define __GFP_COLD ((__force gfp_t)___GFP_COLD) #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4c6790bb7afb..34ce3ebf97d5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -234,15 +234,9 @@ static inline struct page *page_cache_alloc(struct address_space *x) return __page_cache_alloc(mapping_gfp_mask(x)); } -static inline struct page *page_cache_alloc_cold(struct address_space *x) -{ - return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD); -} - static inline gfp_t readahead_gfp_mask(struct address_space *x) { - return mapping_gfp_mask(x) | - __GFP_COLD | __GFP_NORETRY | __GFP_NOWARN; + return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN; } typedef int filler_t(void *, struct page *); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index aa1341474916..7c46fd0b8b64 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2672,7 +2672,7 @@ static inline struct page *__dev_alloc_pages(gfp_t gfp_mask, * 4. __GFP_MEMALLOC is ignored if __GFP_NOMEMALLOC is set due to * code in gfp_to_alloc_flags that should be enforcing this. */ - gfp_mask |= __GFP_COLD | __GFP_COMP | __GFP_MEMALLOC; + gfp_mask |= __GFP_COMP | __GFP_MEMALLOC; return alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); } diff --git a/include/linux/slab.h b/include/linux/slab.h index 79f6532f8a0b..50697a1d6621 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -467,9 +467,6 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) * Also it is possible to set different flags by OR'ing * in one or more of the following additional @flags: * - * %__GFP_COLD - Request cache-cold pages instead of - * trying to return cache-warm pages. - * * %__GFP_HIGH - This allocation has high priority and may use emergency pools. * * %__GFP_NOFAIL - Indicate that this allocation is in no way allowed to fail diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 72162f3a03fa..dbe1bb058c09 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -32,7 +32,6 @@ {(unsigned long)__GFP_ATOMIC, "__GFP_ATOMIC"}, \ {(unsigned long)__GFP_IO, "__GFP_IO"}, \ {(unsigned long)__GFP_FS, "__GFP_FS"}, \ - {(unsigned long)__GFP_COLD, "__GFP_COLD"}, \ {(unsigned long)__GFP_NOWARN, "__GFP_NOWARN"}, \ {(unsigned long)__GFP_RETRY_MAYFAIL, "__GFP_RETRY_MAYFAIL"}, \ {(unsigned long)__GFP_NOFAIL, "__GFP_NOFAIL"}, \ diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index a917a301e201..bce0464524d8 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1884,7 +1884,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) */ static inline int get_highmem_buffer(int safe_needed) { - buffer = get_image_page(GFP_ATOMIC | __GFP_COLD, safe_needed); + buffer = get_image_page(GFP_ATOMIC, safe_needed); return buffer ? 0 : -ENOMEM; } @@ -1945,7 +1945,7 @@ static int swsusp_alloc(struct memory_bitmap *copy_bm, while (nr_pages-- > 0) { struct page *page; - page = alloc_image_page(GFP_ATOMIC | __GFP_COLD); + page = alloc_image_page(GFP_ATOMIC); if (!page) goto err_out; memory_bm_set_bit(copy_bm, page_to_pfn(page)); diff --git a/mm/filemap.c b/mm/filemap.c index 90a9f261f85f..923fc2ebd74a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2272,7 +2272,7 @@ no_cached_page: * Ok, it wasn't cached, so we need to create a new * page.. */ - page = page_cache_alloc_cold(mapping); + page = page_cache_alloc(mapping); if (!page) { error = -ENOMEM; goto out; @@ -2384,7 +2384,7 @@ static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask) int ret; do { - page = __page_cache_alloc(gfp_mask|__GFP_COLD); + page = __page_cache_alloc(gfp_mask); if (!page) return -ENOMEM; @@ -2788,7 +2788,7 @@ static struct page *do_read_cache_page(struct address_space *mapping, repeat: page = find_get_page(mapping, index); if (!page) { - page = __page_cache_alloc(gfp | __GFP_COLD); + page = __page_cache_alloc(gfp); if (!page) return ERR_PTR(-ENOMEM); err = add_to_page_cache_lru(page, mapping, index, gfp); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f265d37b3152..370b64d03e3f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2336,7 +2336,7 @@ retry: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype, bool cold) + int migratetype) { int i, alloced = 0; @@ -2358,10 +2358,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * merge IO requests if the physical pages are ordered * properly. */ - if (likely(!cold)) - list_add(&page->lru, list); - else - list_add_tail(&page->lru, list); + list_add(&page->lru, list); list = &page->lru; alloced++; if (is_migrate_cma(get_pcppage_migratetype(page))) @@ -2795,7 +2792,7 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) /* Remove page from the per-cpu list, caller must protect the list */ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, - bool cold, struct per_cpu_pages *pcp, + struct per_cpu_pages *pcp, struct list_head *list) { struct page *page; @@ -2804,16 +2801,12 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype, if (list_empty(list)) { pcp->count += rmqueue_bulk(zone, 0, pcp->batch, list, - migratetype, cold); + migratetype); if (unlikely(list_empty(list))) return NULL; } - if (cold) - page = list_last_entry(list, struct page, lru); - else - page = list_first_entry(list, struct page, lru); - + page = list_first_entry(list, struct page, lru); list_del(&page->lru); pcp->count--; } while (check_new_pcp(page)); @@ -2828,14 +2821,13 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, { struct per_cpu_pages *pcp; struct list_head *list; - bool cold = ((gfp_flags & __GFP_COLD) != 0); struct page *page; unsigned long flags; local_irq_save(flags); pcp = &this_cpu_ptr(zone->pageset)->pcp; list = &pcp->lists[migratetype]; - page = __rmqueue_pcplist(zone, migratetype, cold, pcp, list); + page = __rmqueue_pcplist(zone, migratetype, pcp, list); if (page) { __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone); diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 15dab691ea70..9158e5a81391 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -81,7 +81,7 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, static int pcpu_alloc_pages(struct pcpu_chunk *chunk, struct page **pages, int page_start, int page_end) { - const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; + const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM; unsigned int cpu, tcpu; int i; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 6cd057b41f34..9c68555bb906 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -353,7 +353,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) */ void *netdev_alloc_frag(unsigned int fragsz) { - return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); + return __netdev_alloc_frag(fragsz, GFP_ATOMIC); } EXPORT_SYMBOL(netdev_alloc_frag); @@ -366,7 +366,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) void *napi_alloc_frag(unsigned int fragsz) { - return __napi_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); + return __napi_alloc_frag(fragsz, GFP_ATOMIC); } EXPORT_SYMBOL(napi_alloc_frag); diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index cbf70738ef5f..ae11e4c3516a 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -641,7 +641,6 @@ static const struct { { "__GFP_ATOMIC", "_A" }, { "__GFP_IO", "I" }, { "__GFP_FS", "F" }, - { "__GFP_COLD", "CO" }, { "__GFP_NOWARN", "NWR" }, { "__GFP_RETRY_MAYFAIL", "R" }, { "__GFP_NOFAIL", "NF" }, From 0fac3ba527f23219678c7c10c767e37d40127b51 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 15 Nov 2017 17:38:07 -0800 Subject: [PATCH 111/131] mm, page_alloc: simplify list handling in rmqueue_bulk() rmqueue_bulk() fills an empty pcplist with pages from the free list. It tries to preserve increasing order by pfn to the caller, because it leads to better performance with some I/O controllers, as explained in commit e084b2d95e48 ("page-allocator: preserve PFN ordering when __GFP_COLD is set"). To preserve the order, it's sufficient to add pages to the tail of the list as they are retrieved. The current code instead adds to the head of the list, but then updates the list head pointer to the last added page, in each step. This does result in the same order, but is needlessly confusing and potentially wasteful, with no apparent benefit. This patch simplifies the code and adjusts comment accordingly. Link: http://lkml.kernel.org/r/f6505442-98a9-12e4-b2cd-0fa83874c159@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 370b64d03e3f..7ca668e946e5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2350,16 +2350,16 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, continue; /* - * Split buddy pages returned by expand() are received here - * in physical page order. The page is added to the callers and - * list and the list head then moves forward. From the callers - * perspective, the linked list is ordered by page number in - * some conditions. This is useful for IO devices that can - * merge IO requests if the physical pages are ordered - * properly. + * Split buddy pages returned by expand() are received here in + * physical page order. The page is added to the tail of + * caller's list. From the callers perspective, the linked list + * is ordered by page number under some conditions. This is + * useful for IO devices that can forward direction from the + * head, thus also in the physical page order. This is useful + * for IO devices that can merge IO requests if the physical + * pages are ordered properly. */ - list_add(&page->lru, list); - list = &page->lru; + list_add_tail(&page->lru, list); alloced++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, From 7f0b5fb953e750a7410cc96c67a656d79db48bcb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 15 Nov 2017 17:38:10 -0800 Subject: [PATCH 112/131] mm, pagevec: rename pagevec drained field According to Vlastimil Babka, the drained field in pagevec is potentially misleading because it might be interpreted as draining this pagevec instead of the percpu lru pagevecs. Rename the field for clarity. Link: http://lkml.kernel.org/r/20171019093346.ylahzdpzmoriyf4v@techsingularity.net Signed-off-by: Mel Gorman Suggested-by: Vlastimil Babka Cc: Andi Kleen Cc: Dave Chinner Cc: Dave Hansen Cc: Jan Kara Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 4 ++-- mm/swap.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index eebefd209424..5fb6580f7f23 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -17,7 +17,7 @@ struct address_space; struct pagevec { unsigned long nr; - bool drained; + bool percpu_pvec_drained; struct page *pages[PAGEVEC_SIZE]; }; @@ -53,7 +53,7 @@ static inline unsigned pagevec_lookup_tag(struct pagevec *pvec, static inline void pagevec_init(struct pagevec *pvec) { pvec->nr = 0; - pvec->drained = false; + pvec->percpu_pvec_drained = false; } static inline void pagevec_reinit(struct pagevec *pvec) diff --git a/mm/swap.c b/mm/swap.c index b480279c760c..38e1b6374a97 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -833,9 +833,9 @@ EXPORT_SYMBOL(release_pages); */ void __pagevec_release(struct pagevec *pvec) { - if (!pvec->drained) { + if (!pvec->percpu_pvec_drained) { lru_add_drain(); - pvec->drained = true; + pvec->percpu_pvec_drained = true; } release_pages(pvec->pages, pagevec_count(pvec)); pagevec_reinit(pvec); From 313674661925ee265f16570c893ea13cb9e00b82 Mon Sep 17 00:00:00 2001 From: Otto Ebeling Date: Wed, 15 Nov 2017 17:38:14 -0800 Subject: [PATCH 113/131] Unify migrate_pages and move_pages access checks Commit 197e7e521384 ("Sanitize 'move_pages()' permission checks") fixed a security issue I reported in the move_pages syscall, and made it so that you can't act on set-uid processes unless you have the CAP_SYS_PTRACE capability. Unify the access check logic of migrate_pages to match the new behavior of move_pages. We discussed this a bit in the security@ list and thought it'd be good for consistency even though there's no evident security impact. The NUMA node access checks are left intact and require CAP_SYS_NICE as before. Link: http://lkml.kernel.org/r/alpine.DEB.2.11.1710011830320.6333@lakka.kapsi.fi Signed-off-by: Otto Ebeling Acked-by: Michal Hocko Cc: Eric W. Biederman Cc: Willy Tarreau Cc: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a2af6d58a68f..dad166b736ba 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -85,6 +85,7 @@ #include #include #include +#include #include #include #include @@ -1365,7 +1366,6 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, const unsigned long __user *, old_nodes, const unsigned long __user *, new_nodes) { - const struct cred *cred = current_cred(), *tcred; struct mm_struct *mm = NULL; struct task_struct *task; nodemask_t task_nodes; @@ -1401,15 +1401,10 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, err = -EINVAL; /* - * Check if this process has the right to modify the specified - * process. The right exists if the process has administrative - * capabilities, superuser privileges or the same - * userid as the target process. + * Check if this process has the right to modify the specified process. + * Use the regular "ptrace_may_access()" checks. */ - tcred = __task_cred(task); - if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && - !capable(CAP_SYS_NICE)) { + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { rcu_read_unlock(); err = -EPERM; goto out_put; From 9a8ec03ed022b79e56dca820cf04debbb240c7b3 Mon Sep 17 00:00:00 2001 From: weiping zhang Date: Wed, 15 Nov 2017 17:38:18 -0800 Subject: [PATCH 114/131] shmem: convert shmem_init_inodecache() to void shmem_inode_cachep was created with SLAB_PANIC flag and shmem_init_inodecache() never returns non-zero, so convert this function to return void. Link: http://lkml.kernel.org/r/20170909124542.GA35224@bogon.didichuxing.com Signed-off-by: weiping zhang Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 7ea8b276ba8b..d6947d21f66c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3862,12 +3862,11 @@ static void shmem_init_inode(void *foo) inode_init_once(&info->vfs_inode); } -static int shmem_init_inodecache(void) +static void shmem_init_inodecache(void) { shmem_inode_cachep = kmem_cache_create("shmem_inode_cache", sizeof(struct shmem_inode_info), 0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode); - return 0; } static void shmem_destroy_inodecache(void) @@ -3991,9 +3990,7 @@ int __init shmem_init(void) if (shmem_inode_cachep) return 0; - error = shmem_init_inodecache(); - if (error) - goto out3; + shmem_init_inodecache(); error = register_filesystem(&shmem_fs_type); if (error) { @@ -4020,7 +4017,6 @@ out1: unregister_filesystem(&shmem_fs_type); out2: shmem_destroy_inodecache(); -out3: shm_mnt = ERR_PTR(error); return error; } From 4518085e127dff97e74f74a8780d7564e273bec8 Mon Sep 17 00:00:00 2001 From: Kemi Wang Date: Wed, 15 Nov 2017 17:38:22 -0800 Subject: [PATCH 115/131] mm, sysctl: make NUMA stats configurable This is the second step which introduces a tunable interface that allow numa stats configurable for optimizing zone_statistics(), as suggested by Dave Hansen and Ying Huang. ========================================================================= When page allocation performance becomes a bottleneck and you can tolerate some possible tool breakage and decreased numa counter precision, you can do: echo 0 > /proc/sys/vm/numa_stat In this case, numa counter update is ignored. We can see about *4.8%*(185->176) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench01 (single thread) and *8.1%*(343->315) drop of cpu cycles per single page allocation and reclaim on Jesper's page_bench03 (88 threads) running on a 2-Socket Broadwell-based server (88 threads, 126G memory). Benchmark link provided by Jesper D Brouer (increase loop times to 10000000): https://github.com/netoptimizer/prototype-kernel/tree/master/kernel/mm/bench ========================================================================= When page allocation performance is not a bottleneck and you want all tooling to work, you can do: echo 1 > /proc/sys/vm/numa_stat This is system default setting. Many thanks to Michal Hocko, Dave Hansen, Ying Huang and Vlastimil Babka for comments to help improve the original patch. [keescook@chromium.org: make sure mutex is a global static] Link: http://lkml.kernel.org/r/20171107213809.GA4314@beast Link: http://lkml.kernel.org/r/1508290927-8518-1-git-send-email-kemi.wang@intel.com Signed-off-by: Kemi Wang Signed-off-by: Kees Cook Reported-by: Jesper Dangaard Brouer Suggested-by: Dave Hansen Suggested-by: Ying Huang Acked-by: Vlastimil Babka Acked-by: Michal Hocko Cc: "Luis R . Rodriguez" Cc: Kees Cook Cc: Jonathan Corbet Cc: Mel Gorman Cc: Johannes Weiner Cc: Christopher Lameter Cc: Sebastian Andrzej Siewior Cc: Andrey Ryabinin Cc: Tim Chen Cc: Andi Kleen Cc: Aaron Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 16 +++++++++ include/linux/vmstat.h | 10 ++++++ kernel/sysctl.c | 9 +++++ mm/mempolicy.c | 3 ++ mm/page_alloc.c | 6 ++++ mm/vmstat.c | 71 +++++++++++++++++++++++++++++++++++++ 6 files changed, 115 insertions(+) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 3e579740b49f..055c8b3e1018 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -58,6 +58,7 @@ Currently, these files are in /proc/sys/vm: - percpu_pagelist_fraction - stat_interval - stat_refresh +- numa_stat - swappiness - user_reserve_kbytes - vfs_cache_pressure @@ -799,6 +800,21 @@ with no ill effects: errors and warnings on these stats are suppressed.) ============================================================== +numa_stat + +This interface allows runtime configuration of numa statistics. + +When page allocation performance becomes a bottleneck and you can tolerate +some possible tool breakage and decreased numa counter precision, you can +do: + echo 0 > /proc/sys/vm/numa_stat + +When page allocation performance is not a bottleneck and you want all +tooling to work, you can do: + echo 1 > /proc/sys/vm/numa_stat + +============================================================== + swappiness This control is used to define how aggressive the kernel will swap diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 1e0cb72e0598..1779c9817b39 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -7,9 +7,19 @@ #include #include #include +#include extern int sysctl_stat_interval; +#ifdef CONFIG_NUMA +#define ENABLE_NUMA_STAT 1 +#define DISABLE_NUMA_STAT 0 +extern int sysctl_vm_numa_stat; +DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); +extern int sysctl_vm_numa_stat_handler(struct ctl_table *table, + int write, void __user *buffer, size_t *length, loff_t *ppos); +#endif + #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 7638e2f7fff8..4a13a389e99b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1356,6 +1356,15 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &hugetlb_mempolicy_sysctl_handler, }, + { + .procname = "numa_stat", + .data = &sysctl_vm_numa_stat, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = sysctl_vm_numa_stat_handler, + .extra1 = &zero, + .extra2 = &one, + }, #endif { .procname = "hugetlb_shm_group", diff --git a/mm/mempolicy.c b/mm/mempolicy.c index dad166b736ba..4ce44d3ff03d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1915,6 +1915,9 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, struct page *page; page = __alloc_pages(gfp, order, nid); + /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ + if (!static_branch_likely(&vm_numa_stat_key)) + return page; if (page && page_to_nid(page) == nid) { preempt_disable(); __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7ca668e946e5..67f523c4711a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -82,6 +82,8 @@ DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); #endif +DEFINE_STATIC_KEY_TRUE(vm_numa_stat_key); + #ifdef CONFIG_HAVE_MEMORYLESS_NODES /* * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. @@ -2777,6 +2779,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) #ifdef CONFIG_NUMA enum numa_stat_item local_stat = NUMA_LOCAL; + /* skip numa counters update if numa stats is disabled */ + if (!static_branch_likely(&vm_numa_stat_key)) + return; + if (z->node != numa_node_id()) local_stat = NUMA_OTHER; diff --git a/mm/vmstat.c b/mm/vmstat.c index 7d11554861e4..40b2db6db6b1 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -32,6 +32,77 @@ #define NUMA_STATS_THRESHOLD (U16_MAX - 2) +#ifdef CONFIG_NUMA +int sysctl_vm_numa_stat = ENABLE_NUMA_STAT; + +/* zero numa counters within a zone */ +static void zero_zone_numa_counters(struct zone *zone) +{ + int item, cpu; + + for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) { + atomic_long_set(&zone->vm_numa_stat[item], 0); + for_each_online_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item] + = 0; + } +} + +/* zero numa counters of all the populated zones */ +static void zero_zones_numa_counters(void) +{ + struct zone *zone; + + for_each_populated_zone(zone) + zero_zone_numa_counters(zone); +} + +/* zero global numa counters */ +static void zero_global_numa_counters(void) +{ + int item; + + for (item = 0; item < NR_VM_NUMA_STAT_ITEMS; item++) + atomic_long_set(&vm_numa_stat[item], 0); +} + +static void invalid_numa_statistics(void) +{ + zero_zones_numa_counters(); + zero_global_numa_counters(); +} + +static DEFINE_MUTEX(vm_numa_stat_lock); + +int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + int ret, oldval; + + mutex_lock(&vm_numa_stat_lock); + if (write) + oldval = sysctl_vm_numa_stat; + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); + if (ret || !write) + goto out; + + if (oldval == sysctl_vm_numa_stat) + goto out; + else if (sysctl_vm_numa_stat == ENABLE_NUMA_STAT) { + static_branch_enable(&vm_numa_stat_key); + pr_info("enable numa statistics\n"); + } else { + static_branch_disable(&vm_numa_stat_key); + invalid_numa_statistics(); + pr_info("disable numa statistics, and clear numa counters\n"); + } + +out: + mutex_unlock(&vm_numa_stat_lock); + return ret; +} +#endif + #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; EXPORT_PER_CPU_SYMBOL(vm_event_states); From 72b03fcd5d515441d4aefcad01c1c4392c8099c9 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 15 Nov 2017 17:38:26 -0800 Subject: [PATCH 116/131] mm: mlock: remove lru_add_drain_all() lru_add_drain_all() is not required by mlock() and it will drain everything that has been cached at the time mlock is called. And that is not really related to the memory which will be faulted in (and cached) and mlocked by the syscall itself. If anything lru_add_drain_all() should be called _after_ pages have been mlocked and faulted in but even that is not strictly needed because those pages would get to the appropriate LRUs lazily during the reclaim path. Moreover follow_page_pte (gup) will drain the local pcp LRU cache. On larger machines the overhead of lru_add_drain_all() in mlock() can be significant when mlocking data already in memory. We have observed high latency in mlock() due to lru_add_drain_all() when the users were mlocking in memory tmpfs files. [mhocko@suse.com: changelog fix] Link: http://lkml.kernel.org/r/20171019222507.2894-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Acked-by: Balbir Singh Acked-by: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Joonsoo Kim Cc: Minchan Kim Cc: Yisheng Xie Cc: Ingo Molnar Cc: Greg Thelen Cc: Hugh Dickins Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index ed37cb208d19..30472d438794 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -670,8 +670,6 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla if (!can_do_mlock()) return -EPERM; - lru_add_drain_all(); /* flush pagevec */ - len = PAGE_ALIGN(len + (offset_in_page(start))); start &= PAGE_MASK; @@ -798,9 +796,6 @@ SYSCALL_DEFINE1(mlockall, int, flags) if (!can_do_mlock()) return -EPERM; - if (flags & MCL_CURRENT) - lru_add_drain_all(); /* flush pagevec */ - lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; From b050e3769c6b4013bb937e879fc43bf1847ee819 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 15 Nov 2017 17:38:30 -0800 Subject: [PATCH 117/131] mm, page_alloc: fix potential false positive in __zone_watermark_ok Since commit 97a16fc82a7c ("mm, page_alloc: only enforce watermarks for order-0 allocations"), __zone_watermark_ok() check for high-order allocations will shortcut per-migratetype free list checks for ALLOC_HARDER allocations, and return true as long as there's free page of any migratetype. The intention is that ALLOC_HARDER can allocate from MIGRATE_HIGHATOMIC free lists, while normal allocations can't. However, as a side effect, the watermark check will then also return true when there are pages only on the MIGRATE_ISOLATE list, or (prior to CMA conversion to ZONE_MOVABLE) on the MIGRATE_CMA list. Since the allocation cannot actually obtain isolated pages, and might not be able to obtain CMA pages, this can result in a false positive. The condition should be rare and perhaps the outcome is not a fatal one. Still, it's better if the watermark check is correct. There also shouldn't be a performance tradeoff here. Link: http://lkml.kernel.org/r/20171102125001.23708-1-vbabka@suse.cz Fixes: 97a16fc82a7c ("mm, page_alloc: only enforce watermarks for order-0 allocations") Signed-off-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Rik van Riel Cc: David Rientjes Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 67f523c4711a..04bf1ad50144 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3041,9 +3041,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, if (!area->nr_free) continue; - if (alloc_harder) - return true; - for (mt = 0; mt < MIGRATE_PCPTYPES; mt++) { if (!list_empty(&area->free_list[mt])) return true; @@ -3055,6 +3052,9 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, return true; } #endif + if (alloc_harder && + !list_empty(&area->free_list[MIGRATE_HIGHATOMIC])) + return true; } return false; } From df206988e03e8510e08b2ee77f3e65010ed97768 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 15 Nov 2017 17:38:34 -0800 Subject: [PATCH 118/131] fs: fuse: account fuse_inode slab memory as reclaimable Fuse inodes are currently included in the unreclaimable slab counts - SUnreclaim in /proc/meminfo, slab_unreclaimable in /proc/vmstat and the per-cgroup memory.stat. But they are reclaimable just like other filesystems' inodes, and /proc/sys/vm/drop_caches frees them easily. Mark the slab cache reclaimable. Link: http://lkml.kernel.org/r/20171102202727.12539-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fuse/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 94a745acaef8..5c0f0d689fb2 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1273,9 +1273,9 @@ static int __init fuse_fs_init(void) int err; fuse_inode_cachep = kmem_cache_create("fuse_inode", - sizeof(struct fuse_inode), 0, - SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, - fuse_inode_init_once); + sizeof(struct fuse_inode), 0, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT|SLAB_RECLAIM_ACCOUNT, + fuse_inode_init_once); err = -ENOMEM; if (!fuse_inode_cachep) goto out; From 400e22499dd92613821374c8c6c88c7225359980 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 15 Nov 2017 17:38:37 -0800 Subject: [PATCH 119/131] mm: don't warn about allocations which stall for too long Commit 63f53dea0c98 ("mm: warn about allocations which stall for too long") was a great step for reducing possibility of silent hang up problem caused by memory allocation stalls. But this commit reverts it, for it is possible to trigger OOM lockup and/or soft lockups when many threads concurrently called warn_alloc() (in order to warn about memory allocation stalls) due to current implementation of printk(), and it is difficult to obtain useful information due to limitation of synchronous warning approach. Current printk() implementation flushes all pending logs using the context of a thread which called console_unlock(). printk() should be able to flush all pending logs eventually unless somebody continues appending to printk() buffer. Since warn_alloc() started appending to printk() buffer while waiting for oom_kill_process() to make forward progress when oom_kill_process() is processing pending logs, it became possible for warn_alloc() to force oom_kill_process() loop inside printk(). As a result, warn_alloc() significantly increased possibility of preventing oom_kill_process() from making forward progress. ---------- Pseudo code start ---------- Before warn_alloc() was introduced: retry: if (mutex_trylock(&oom_lock)) { while (atomic_read(&printk_pending_logs) > 0) { atomic_dec(&printk_pending_logs); print_one_log(); } // Send SIGKILL here. mutex_unlock(&oom_lock) } goto retry; After warn_alloc() was introduced: retry: if (mutex_trylock(&oom_lock)) { while (atomic_read(&printk_pending_logs) > 0) { atomic_dec(&printk_pending_logs); print_one_log(); } // Send SIGKILL here. mutex_unlock(&oom_lock) } else if (waited_for_10seconds()) { atomic_inc(&printk_pending_logs); } goto retry; ---------- Pseudo code end ---------- Although waited_for_10seconds() becomes true once per 10 seconds, unbounded number of threads can call waited_for_10seconds() at the same time. Also, since threads doing waited_for_10seconds() keep doing almost busy loop, the thread doing print_one_log() can use little CPU resource. Therefore, this situation can be simplified like ---------- Pseudo code start ---------- retry: if (mutex_trylock(&oom_lock)) { while (atomic_read(&printk_pending_logs) > 0) { atomic_dec(&printk_pending_logs); print_one_log(); } // Send SIGKILL here. mutex_unlock(&oom_lock) } else { atomic_inc(&printk_pending_logs); } goto retry; ---------- Pseudo code end ---------- when printk() is called faster than print_one_log() can process a log. One of possible mitigation would be to introduce a new lock in order to make sure that no other series of printk() (either oom_kill_process() or warn_alloc()) can append to printk() buffer when one series of printk() (either oom_kill_process() or warn_alloc()) is already in progress. Such serialization will also help obtaining kernel messages in readable form. ---------- Pseudo code start ---------- retry: if (mutex_trylock(&oom_lock)) { mutex_lock(&oom_printk_lock); while (atomic_read(&printk_pending_logs) > 0) { atomic_dec(&printk_pending_logs); print_one_log(); } // Send SIGKILL here. mutex_unlock(&oom_printk_lock); mutex_unlock(&oom_lock) } else { if (mutex_trylock(&oom_printk_lock)) { atomic_inc(&printk_pending_logs); mutex_unlock(&oom_printk_lock); } } goto retry; ---------- Pseudo code end ---------- But this commit does not go that direction, for we don't want to introduce a new lock dependency, and we unlikely be able to obtain useful information even if we serialized oom_kill_process() and warn_alloc(). Synchronous approach is prone to unexpected results (e.g. too late [1], too frequent [2], overlooked [3]). As far as I know, warn_alloc() never helped with providing information other than "something is going wrong". I want to consider asynchronous approach which can obtain information during stalls with possibly relevant threads (e.g. the owner of oom_lock and kswapd-like threads) and serve as a trigger for actions (e.g. turn on/off tracepoints, ask libvirt daemon to take a memory dump of stalling KVM guest for diagnostic purpose). This commit temporarily loses ability to report e.g. OOM lockup due to unable to invoke the OOM killer due to !__GFP_FS allocation request. But asynchronous approach will be able to detect such situation and emit warning. Thus, let's remove warn_alloc(). [1] https://bugzilla.kernel.org/show_bug.cgi?id=192981 [2] http://lkml.kernel.org/r/CAM_iQpWuPVGc2ky8M-9yukECtS+zKjiDasNymX7rMcBjBFyM_A@mail.gmail.com [3] commit db73ee0d46379922 ("mm, vmscan: do not loop on too_many_isolated for ever")) Link: http://lkml.kernel.org/r/1509017339-4802-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Reported-by: Cong Wang Reported-by: yuwang.yuwang Reported-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Vlastimil Babka Cc: Mel Gorman Cc: Dave Hansen Cc: Sergey Senozhatsky Cc: Petr Mladek Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 04bf1ad50144..bd1a686e40fe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3903,8 +3903,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, enum compact_result compact_result; int compaction_retries; int no_progress_loops; - unsigned long alloc_start = jiffies; - unsigned int stall_timeout = 10 * HZ; unsigned int cpuset_mems_cookie; int reserve_flags; @@ -4036,14 +4034,6 @@ retry: if (!can_direct_reclaim) goto nopage; - /* Make sure we know about allocations which stall for too long */ - if (time_after(jiffies, alloc_start + stall_timeout)) { - warn_alloc(gfp_mask & ~__GFP_NOWARN, ac->nodemask, - "page allocation stalls for %ums, order:%u", - jiffies_to_msecs(jiffies-alloc_start), order); - stall_timeout += 10 * HZ; - } - /* Avoid recursion of direct reclaim */ if (current->flags & PF_MEMALLOC) goto nopage; From d135e5750205a21a212a19dbb05aeb339e2cbea7 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 15 Nov 2017 17:38:41 -0800 Subject: [PATCH 120/131] mm/page_alloc.c: broken deferred calculation In reset_deferred_meminit() we determine number of pages that must not be deferred. We initialize pages for at least 2G of memory, but also pages for reserved memory in this node. The reserved memory is determined in this function: memblock_reserved_memory_within(), which operates over physical addresses, and returns size in bytes. However, reset_deferred_meminit() assumes that that this function operates with pfns, and returns page count. The result is that in the best case machine boots slower than expected due to initializing more pages than needed in single thread, and in the worst case panics because fewer than needed pages are initialized early. Link: http://lkml.kernel.org/r/20171021011707.15191-1-pasha.tatashin@oracle.com Fixes: 864b9a393dcb ("mm: consider memblock reservations for deferred memory initialization sizing") Signed-off-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 ++- mm/page_alloc.c | 27 ++++++++++++++++++--------- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 39ccaa9c428b..67f2e3c38939 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -700,7 +700,8 @@ typedef struct pglist_data { * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; - unsigned long static_init_size; + /* Number of non-deferred pages */ + unsigned long static_init_pgcnt; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bd1a686e40fe..8f2b9ad2e23f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -291,28 +291,37 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT + +/* + * Determine how many pages need to be initialized durig early boot + * (non-deferred initialization). + * The value of first_deferred_pfn will be set later, once non-deferred pages + * are initialized, but for now set it ULONG_MAX. + */ static inline void reset_deferred_meminit(pg_data_t *pgdat) { - unsigned long max_initialise; - unsigned long reserved_lowmem; + phys_addr_t start_addr, end_addr; + unsigned long max_pgcnt; + unsigned long reserved; /* * Initialise at least 2G of a node but also take into account that * two large system hashes that can take up 1GB for 0.25TB/node. */ - max_initialise = max(2UL << (30 - PAGE_SHIFT), - (pgdat->node_spanned_pages >> 8)); + max_pgcnt = max(2UL << (30 - PAGE_SHIFT), + (pgdat->node_spanned_pages >> 8)); /* * Compensate the all the memblock reservations (e.g. crash kernel) * from the initial estimation to make sure we will initialize enough * memory to boot. */ - reserved_lowmem = memblock_reserved_memory_within(pgdat->node_start_pfn, - pgdat->node_start_pfn + max_initialise); - max_initialise += reserved_lowmem; + start_addr = PFN_PHYS(pgdat->node_start_pfn); + end_addr = PFN_PHYS(pgdat->node_start_pfn + max_pgcnt); + reserved = memblock_reserved_memory_within(start_addr, end_addr); + max_pgcnt += PHYS_PFN(reserved); - pgdat->static_init_size = min(max_initialise, pgdat->node_spanned_pages); + pgdat->static_init_pgcnt = min(max_pgcnt, pgdat->node_spanned_pages); pgdat->first_deferred_pfn = ULONG_MAX; } @@ -339,7 +348,7 @@ static inline bool update_defer_init(pg_data_t *pgdat, if (zone_end < pgdat_end_pfn(pgdat)) return true; (*nr_initialised)++; - if ((*nr_initialised > pgdat->static_init_size) && + if ((*nr_initialised > pgdat->static_init_pgcnt) && (pfn & (PAGES_PER_SECTION - 1)) == 0) { pgdat->first_deferred_pfn = pfn; return false; From c8402871d54a8e00016e040c1b8f5d31e96fcd94 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 15 Nov 2017 17:38:45 -0800 Subject: [PATCH 121/131] mm/shmem.c: mark expected switch fall-through In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Link: http://lkml.kernel.org/r/20171020191058.GA24427@embeddedor.com Signed-off-by: Gustavo A. R. Silva Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/shmem.c b/mm/shmem.c index d6947d21f66c..ab22eaa2412e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4098,6 +4098,7 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) if (i_size >= HPAGE_PMD_SIZE && i_size >> PAGE_SHIFT >= off) return true; + /* fall through */ case SHMEM_HUGE_ADVISE: /* TODO: implement fadvise() hints */ return (vma->vm_flags & VM_HUGEPAGE); From 5b568acc3c2328f7d8da3cb03b4ef343f93545c6 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Wed, 15 Nov 2017 17:38:49 -0800 Subject: [PATCH 122/131] mm/list_lru.c: mark expected switch fall-through In preparation for enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Link: http://lkml.kernel.org/r/20171020190754.GA24332@embeddedor.com Signed-off-by: Gustavo A. R. Silva Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/list_lru.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/list_lru.c b/mm/list_lru.c index f141f0c80ff3..fd41e969ede5 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -221,6 +221,7 @@ restart: switch (ret) { case LRU_REMOVED_RETRY: assert_spin_locked(&nlru->lock); + /* fall through */ case LRU_REMOVED: isolated++; nlru->nr_items--; From fec11bc0396bbd82b152e6ce9a47483ffd69462a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 15 Nov 2017 17:38:52 -0800 Subject: [PATCH 123/131] mm/hmm: remove redundant variable align_end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Variable align_end is assigned a value but it is never read, so the variable is redundant and can be removed. Cleans up the clang warning: Value stored to 'align_end' is never read Link: http://lkml.kernel.org/r/20171017143837.23207-1-colin.king@canonical.com Signed-off-by: Colin Ian King Reviewed-by: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hmm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index a88a847bccba..ea19742a5d60 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -803,11 +803,10 @@ static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL); static void hmm_devmem_radix_release(struct resource *resource) { - resource_size_t key, align_start, align_size, align_end; + resource_size_t key, align_start, align_size; align_start = resource->start & ~(PA_SECTION_SIZE - 1); align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); - align_end = align_start + align_size - 1; mutex_lock(&hmm_devmem_lock); for (key = resource->start; From fcdaf842bd8f538a88059ce0243bc2822ed1b0e0 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Nov 2017 17:38:56 -0800 Subject: [PATCH 124/131] mm, sparse: do not swamp log with huge vmemmap allocation failures While doing memory hotplug tests under heavy memory pressure we have noticed too many page allocation failures when allocating vmemmap memmap backed by huge page kworker/u3072:1: page allocation failure: order:9, mode:0x24084c0(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO) [...] Call Trace: dump_trace+0x59/0x310 show_stack_log_lvl+0xea/0x170 show_stack+0x21/0x40 dump_stack+0x5c/0x7c warn_alloc_failed+0xe2/0x150 __alloc_pages_nodemask+0x3ed/0xb20 alloc_pages_current+0x7f/0x100 vmemmap_alloc_block+0x79/0xb6 __vmemmap_alloc_block_buf+0x136/0x145 vmemmap_populate+0xd2/0x2b9 sparse_mem_map_populate+0x23/0x30 sparse_add_one_section+0x68/0x18e __add_pages+0x10a/0x1d0 arch_add_memory+0x4a/0xc0 add_memory_resource+0x89/0x160 add_memory+0x6d/0xd0 acpi_memory_device_add+0x181/0x251 acpi_bus_attach+0xfd/0x19b acpi_bus_scan+0x59/0x69 acpi_device_hotplug+0xd2/0x41f acpi_hotplug_work_fn+0x1a/0x23 process_one_work+0x14e/0x410 worker_thread+0x116/0x490 kthread+0xbd/0xe0 ret_from_fork+0x3f/0x70 and we do see many of those because essentially every allocation fails for each memory section. This is an excessive way to tell the user that there is nothing to really worry about because we do have a fallback mechanism to use base pages. The only downside might be a performance degradation due to TLB pressure. This patch changes vmemmap_alloc_block() to use __GFP_NOWARN and warn explicitly once on the first allocation failure. This will reduce the noise in the kernel log considerably, while we still have an indication that a performance might be impacted. [mhocko@kernel.org: forgot to git add the follow up fix] Link: http://lkml.kernel.org/r/20171107090635.c27thtse2lchjgvb@dhcp22.suse.cz Link: http://lkml.kernel.org/r/20171106092228.31098-1-mhocko@kernel.org Signed-off-by: Johannes Weiner Signed-off-by: Michal Hocko Cc: Joe Perches Cc: Vlastimil Babka Cc: Khalid Aziz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/init_64.c | 1 - mm/sparse-vmemmap.c | 12 ++++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index c3fc544b50d2..4a837289f2ad 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1405,7 +1405,6 @@ static int __meminit vmemmap_populate_hugepages(unsigned long start, vmemmap_verify((pte_t *)pmd, node, addr, next); continue; } - pr_warn_once("vmemmap: falling back to regular page backing\n"); if (vmemmap_populate_basepages(addr, next, node)) return -ENOMEM; } diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index 4e49762599c8..17acf01791fa 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -53,12 +53,20 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) { /* If the main allocator is up use that, fallback to bootmem. */ if (slab_is_available()) { + gfp_t gfp_mask = GFP_KERNEL|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; + int order = get_order(size); + static bool warned; struct page *page; - page = alloc_pages_node(node, GFP_KERNEL | __GFP_RETRY_MAYFAIL, - get_order(size)); + page = alloc_pages_node(node, gfp_mask, order); if (page) return page_address(page); + + if (!warned) { + warn_alloc(gfp_mask & ~__GFP_NOWARN, NULL, + "vmemmap alloc failure: order:%u", order); + warned = true; + } return NULL; } else return __earlyonly_bootmem_alloc(node, size, size, From 0a7f682d04652fd91de0cbc1ea3d7aa1e45611c7 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Nov 2017 17:38:59 -0800 Subject: [PATCH 125/131] mm: do not rely on preempt_count in print_vma_addr The preempt count check on print_vma_addr has been added by commit e8bff74afbdb ("x86: fix "BUG: sleeping function called from invalid context" in print_vma_addr()") and it relied on the elevated preempt count from preempt_conditional_sti because preempt_count check doesn't work on non preemptive kernels by default. The code has evolved though and commit d99e1bd175f4 ("x86/entry/traps: Refactor preemption and interrupt flag handling") has replaced preempt_conditional_sti by an explicit preempt_disable which is noop on !PREEMPT so the check in print_vma_addr is broken. Fix the issue by using trylock on mmap_sem rather than chacking the preempt count. The allocation we are relying on has to be GFP_NOWAIT as well. There is a chance that we won't dump the vma state if the lock is contended or the memory short but this is acceptable outcome and much less fragile than the not working preemption check or tricks around it. Link: http://lkml.kernel.org/r/20171106134031.g6dbelg55mrbyc6i@dhcp22.suse.cz Fixes: d99e1bd175f4 ("x86/entry/traps: Refactor preemption and interrupt flag handling") Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Acked-by: Yang Shi Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 6dec21b182b0..85e7a87da79f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4485,17 +4485,15 @@ void print_vma_addr(char *prefix, unsigned long ip) struct vm_area_struct *vma; /* - * Do not print if we are in atomic - * contexts (in exception stacks, etc.): + * we might be running from an atomic context so we cannot sleep */ - if (preempt_count()) + if (!down_read_trylock(&mm->mmap_sem)) return; - down_read(&mm->mmap_sem); vma = find_vma(mm, ip); if (vma && vma->vm_file) { struct file *f = vma->vm_file; - char *buf = (char *)__get_free_page(GFP_KERNEL); + char *buf = (char *)__get_free_page(GFP_NOWAIT); if (buf) { char *p; From 2bce774e8245e95db81872ec39522cde8b486fc8 Mon Sep 17 00:00:00 2001 From: Wang Long Date: Wed, 15 Nov 2017 17:39:03 -0800 Subject: [PATCH 126/131] writeback: remove unused function parameter The parameter `struct bdi_writeback *wb` is not been used in the function body. Remove it. Link: http://lkml.kernel.org/r/1509685485-15278-1-git-send-email-wanglong19@meituan.com Signed-off-by: Wang Long Reviewed-by: Jan Kara Acked-by: Tejun Heo Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev.h | 2 +- mm/page-writeback.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 7360e79e9a2f..e54e7e0033eb 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -93,7 +93,7 @@ extern void wb_writeout_inc(struct bdi_writeback *wb); /* * maximal error of a stat counter. */ -static inline unsigned long wb_stat_error(struct bdi_writeback *wb) +static inline unsigned long wb_stat_error(void) { #ifdef CONFIG_SMP return nr_cpu_ids * WB_STAT_BATCH; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 05313f402ba8..8a1551154285 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1545,7 +1545,7 @@ static inline void wb_dirty_limits(struct dirty_throttle_control *dtc) * actually dirty; with m+n sitting in the percpu * deltas. */ - if (dtc->wb_thresh < 2 * wb_stat_error(wb)) { + if (dtc->wb_thresh < 2 * wb_stat_error()) { wb_reclaimable = wb_stat_sum(wb, WB_RECLAIMABLE); dtc->wb_dirty = wb_reclaimable + wb_stat_sum(wb, WB_WRITEBACK); } else { @@ -1803,7 +1803,7 @@ pause: * more page. However wb_dirty has accounting errors. So use * the larger and more IO friendly wb_stat_error. */ - if (sdtc->wb_dirty <= wb_stat_error(wb)) + if (sdtc->wb_dirty <= wb_stat_error()) break; if (fatal_signal_pending(current)) From e492080e640c2d1235ddf3441cae634cfffef7e1 Mon Sep 17 00:00:00 2001 From: Jaewon Kim Date: Wed, 15 Nov 2017 17:39:07 -0800 Subject: [PATCH 127/131] mm/page_ext.c: check if page_ext is not prepared online_page_ext() and page_ext_init() allocate page_ext for each section, but they do not allocate if the first PFN is !pfn_present(pfn) or !pfn_valid(pfn). Then section->page_ext remains as NULL. lookup_page_ext checks NULL only if CONFIG_DEBUG_VM is enabled. For a valid PFN, __set_page_owner will try to get page_ext through lookup_page_ext. Without CONFIG_DEBUG_VM lookup_page_ext will misuse NULL pointer as value 0. This incurrs invalid address access. This is the panic example when PFN 0x100000 is not valid but PFN 0x13FC00 is being used for page_ext. section->page_ext is NULL, get_entry returned invalid page_ext address as 0x1DFA000 for a PFN 0x13FC00. To avoid this panic, CONFIG_DEBUG_VM should be removed so that page_ext will be checked at all times. Unable to handle kernel paging request at virtual address 01dfa014 ------------[ cut here ]------------ Kernel BUG at ffffff80082371e0 [verbose debug info unavailable] Internal error: Oops: 96000045 [#1] PREEMPT SMP Modules linked in: PC is at __set_page_owner+0x48/0x78 LR is at __set_page_owner+0x44/0x78 __set_page_owner+0x48/0x78 get_page_from_freelist+0x880/0x8e8 __alloc_pages_nodemask+0x14c/0xc48 __do_page_cache_readahead+0xdc/0x264 filemap_fault+0x2ac/0x550 ext4_filemap_fault+0x3c/0x58 __do_fault+0x80/0x120 handle_mm_fault+0x704/0xbb0 do_page_fault+0x2e8/0x394 do_mem_abort+0x88/0x124 Pre-4.7 kernels also need commit f86e4271978b ("mm: check the return value of lookup_page_ext for all call sites"). Link: http://lkml.kernel.org/r/20171107094131.14621-1-jaewon31.kim@samsung.com Fixes: eefa864b701d ("mm/page_ext: resurrect struct page extending code for debugging") Signed-off-by: Jaewon Kim Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Minchan Kim Cc: Joonsoo Kim Cc: [depends on f86e427197, see above] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_ext.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/page_ext.c b/mm/page_ext.c index 4f0367d472c4..2c16216c29b6 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -125,7 +125,6 @@ struct page_ext *lookup_page_ext(struct page *page) struct page_ext *base; base = NODE_DATA(page_to_nid(page))->node_page_ext; -#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are @@ -134,7 +133,6 @@ struct page_ext *lookup_page_ext(struct page *page) */ if (unlikely(!base)) return NULL; -#endif index = pfn - round_down(node_start_pfn(page_to_nid(page)), MAX_ORDER_NR_PAGES); return get_entry(base, index); @@ -199,7 +197,6 @@ struct page_ext *lookup_page_ext(struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); -#if defined(CONFIG_DEBUG_VM) /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are @@ -208,7 +205,6 @@ struct page_ext *lookup_page_ext(struct page *page) */ if (!section->page_ext) return NULL; -#endif return get_entry(section->page_ext, pfn); } From c50842c8e1cddcdb69d3ece4f4df005a0e6c5ceb Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 15 Nov 2017 17:39:10 -0800 Subject: [PATCH 128/131] mm,oom_reaper: remove pointless kthread_run() error check Since oom_init() is called before userspace processes start, memory allocation failure for creating the OOM reaper kernel thread will let the OOM killer call panic() rather than wake up the OOM reaper. Link: http://lkml.kernel.org/r/1510137800-4602-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 26add8a0d1f7..a501a0a1f0f8 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -640,9 +640,6 @@ static int oom_reaper(void *unused) static void wake_oom_reaper(struct task_struct *tsk) { - if (!oom_reaper_th) - return; - /* tsk is already queued? */ if (tsk == oom_reaper_list || tsk->oom_reaper_list) return; @@ -660,11 +657,6 @@ static void wake_oom_reaper(struct task_struct *tsk) static int __init oom_init(void) { oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); - if (IS_ERR(oom_reaper_th)) { - pr_err("Unable to start OOM reaper %ld. Continuing regardless\n", - PTR_ERR(oom_reaper_th)); - oom_reaper_th = NULL; - } return 0; } subsys_initcall(oom_init) From 0205f75571e3a70c35f0dd5e608773cce97d9dbb Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 15 Nov 2017 17:39:14 -0800 Subject: [PATCH 129/131] mm: simplify nodemask printing alloc_warn() and dump_header() have to explicitly handle NULL nodemask which forces both paths to use pr_cont. We can do better. printk already handles NULL pointers properly so all we need is to teach nodemask_pr_args to handle NULL nodemask carefully. This allows simplification of both alloc_warn() and dump_header() and gets rid of pr_cont altogether. This patch has been motivated by patch from Joe Perches http://lkml.kernel.org/r/b31236dfe3fc924054fd7842bde678e71d193638.1509991345.git.joe@perches.com [akpm@linux-foundation.org: fix tile warning, per Arnd] Link: http://lkml.kernel.org/r/20171109100531.3cn2hcqnuj7mjaju@dhcp22.suse.cz Signed-off-by: Michal Hocko Acked-by: Joe Perches Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nodemask.h | 4 +++- mm/oom_kill.c | 12 ++++-------- mm/page_alloc.c | 12 +++--------- 3 files changed, 10 insertions(+), 18 deletions(-) diff --git a/include/linux/nodemask.h b/include/linux/nodemask.h index de1c50b93c61..15cab3967d6d 100644 --- a/include/linux/nodemask.h +++ b/include/linux/nodemask.h @@ -104,7 +104,9 @@ extern nodemask_t _unused_nodemask_arg_; * * Can be used to provide arguments for '%*pb[l]' when printing a nodemask. */ -#define nodemask_pr_args(maskp) MAX_NUMNODES, (maskp)->bits +#define nodemask_pr_args(maskp) \ + ((maskp) != NULL) ? MAX_NUMNODES : 0, \ + ((maskp) != NULL) ? (maskp)->bits : NULL /* * The inline keyword gives the compiler room to decide to inline, or diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a501a0a1f0f8..c86fbd1b590e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -426,14 +426,10 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) static void dump_header(struct oom_control *oc, struct task_struct *p) { - pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=", - current->comm, oc->gfp_mask, &oc->gfp_mask); - if (oc->nodemask) - pr_cont("%*pbl", nodemask_pr_args(oc->nodemask)); - else - pr_cont("(null)"); - pr_cont(", order=%d, oom_score_adj=%hd\n", - oc->order, current->signal->oom_score_adj); + pr_warn("%s invoked oom-killer: gfp_mask=%#x(%pGg), nodemask=%*pbl, order=%d, oom_score_adj=%hd\n", + current->comm, oc->gfp_mask, &oc->gfp_mask, + nodemask_pr_args(oc->nodemask), oc->order, + current->signal->oom_score_adj); if (!IS_ENABLED(CONFIG_COMPACTION) && oc->order) pr_warn("COMPACTION is disabled!!!\n"); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8f2b9ad2e23f..7a199767dcee 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3279,20 +3279,14 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) return; - pr_warn("%s: ", current->comm); - va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - pr_cont("%pV", &vaf); + pr_warn("%s: %pV, mode:%#x(%pGg), nodemask=%*pbl\n", + current->comm, &vaf, gfp_mask, &gfp_mask, + nodemask_pr_args(nodemask)); va_end(args); - pr_cont(", mode:%#x(%pGg), nodemask=", gfp_mask, &gfp_mask); - if (nodemask) - pr_cont("%*pbl\n", nodemask_pr_args(nodemask)); - else - pr_cont("(null)\n"); - cpuset_print_current_mems_allowed(); dump_stack(); From 0cd842f97069c68718bef21ad2dc96a0578567ec Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Wed, 15 Nov 2017 17:39:18 -0800 Subject: [PATCH 130/131] mm: make alloc_node_mem_map a void call if we don't have CONFIG_FLAT_NODE_MEM_MAP free_area_init_node() calls alloc_node_mem_map(), but this function does nothing unless we have CONFIG_FLAT_NODE_MEM_MAP. As a cleanup, we can move the "#ifdef CONFIG_FLAT_NODE_MEM_MAP" within alloc_node_mem_map() out of the function, and define a alloc_node_mem_map() { } when CONFIG_FLAT_NODE_MEM_MAP is not present. This also moves the printk that lays within the "#ifdef CONFIG_FLAT_NODE_MEM_MAP" block from free_area_init_node() to alloc_node_mem_map(), getting rid of the "#ifdef CONFIG_FLAT_NODE_MEM_MAP" in free_area_init_node(). [akpm@linux-foundation.org: clean up the printk while we're there] Link: http://lkml.kernel.org/r/20171114111935.GA11758@techadventures.net Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7a199767dcee..55ded92f9809 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6151,6 +6151,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) } } +#ifdef CONFIG_FLAT_NODE_MEM_MAP static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { unsigned long __maybe_unused start = 0; @@ -6160,7 +6161,6 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) if (!pgdat->node_spanned_pages) return; -#ifdef CONFIG_FLAT_NODE_MEM_MAP start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); offset = pgdat->node_start_pfn - start; /* ia64 gets its own node_mem_map, before this, without bootmem */ @@ -6182,6 +6182,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) pgdat->node_id); pgdat->node_mem_map = map + offset; } + pr_debug("%s: node %d, pgdat %08lx, node_mem_map %08lx\n", + __func__, pgdat->node_id, (unsigned long)pgdat, + (unsigned long)pgdat->node_mem_map); #ifndef CONFIG_NEED_MULTIPLE_NODES /* * With no DISCONTIG, the global mem_map is just set as node 0's @@ -6194,8 +6197,10 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ } #endif -#endif /* CONFIG_FLAT_NODE_MEM_MAP */ } +#else +static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } +#endif /* CONFIG_FLAT_NODE_MEM_MAP */ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) @@ -6222,11 +6227,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, zones_size, zholes_size); alloc_node_mem_map(pgdat); -#ifdef CONFIG_FLAT_NODE_MEM_MAP - printk(KERN_DEBUG "free_area_init_node: node %d, pgdat %08lx, node_mem_map %08lx\n", - nid, (unsigned long)pgdat, - (unsigned long)pgdat->node_mem_map); -#endif reset_deferred_meminit(pgdat); free_area_init_core(pgdat); From 1b7176aea0a924ac59c6a283129d3e8eb00aa915 Mon Sep 17 00:00:00 2001 From: Fan Du Date: Wed, 15 Nov 2017 17:39:21 -0800 Subject: [PATCH 131/131] memory hotplug: fix comments when adding section Here, pfn_to_node should be page_to_nid. Link: http://lkml.kernel.org/r/1510735205-22540-1-git-send-email-fan.du@intel.com Signed-off-by: Fan Du Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index fab51a6af962..c52aa05b106c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -265,7 +265,7 @@ static int __meminit __add_section(int nid, unsigned long phys_start_pfn, /* * Make all the pages reserved so that nobody will stumble over half * initialized state. - * FIXME: We also have to associate it with a node because pfn_to_node + * FIXME: We also have to associate it with a node because page_to_nid * relies on having page with the proper node. */ for (i = 0; i < PAGES_PER_SECTION; i++) {