From e89166990f11c3f21e1649d760dd35f9e410321c Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 25 Jan 2018 11:02:50 -0700 Subject: [PATCH 001/149] Btrfs: fix deadlock in run_delalloc_nocow @cur_offset is not set back to what it should be (@cow_start) if btrfs_next_leaf() returns something wrong, and the range [cow_start, cur_offset) remains locked forever. cc: Signed-off-by: Liu Bo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c5f31817778b..a68a4acd16e5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -1334,8 +1334,11 @@ next_slot: leaf = path->nodes[0]; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) + if (ret < 0) { + if (cow_start != (u64)-1) + cur_offset = cow_start; goto error; + } if (ret > 0) break; leaf = path->nodes[0]; From 1846430c24d66e85cc58286b3319c82cd54debb2 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 25 Jan 2018 11:02:51 -0700 Subject: [PATCH 002/149] Btrfs: fix crash due to not cleaning up tree log block's dirty bits In cases that the whole fs flips into readonly status due to failures in critical sections, then log tree's blocks are still dirty, and this leads to a crash during umount time, the crash is about use-after-free, umount -> close_ctree -> stop workers -> iput(btree_inode) -> iput_final -> write_inode_now -> ... -> queue job on stop'd workers cc: v3.12+ Fixes: 681ae50917df ("Btrfs: cleanup reserved space when freeing tree log on error") Signed-off-by: Liu Bo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ee1aaed1330e..1920c2149f88 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2471,6 +2471,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, clean_tree_block(fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + } else { + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) + clear_extent_buffer_dirty(next); } WARN_ON(root_owner != @@ -2551,6 +2554,9 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, clean_tree_block(fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + } else { + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) + clear_extent_buffer_dirty(next); } WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); @@ -2629,6 +2635,9 @@ static int walk_log_tree(struct btrfs_trans_handle *trans, clean_tree_block(fs_info, next); btrfs_wait_tree_block_writeback(next); btrfs_tree_unlock(next); + } else { + if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &next->bflags)) + clear_extent_buffer_dirty(next); } WARN_ON(log->root_key.objectid != From 55237a5f2431a72435e3ed39e4306e973c0446b7 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 25 Jan 2018 11:02:52 -0700 Subject: [PATCH 003/149] Btrfs: fix extent state leak from tree log It's possible that btrfs_sync_log() bails out after one of the two btrfs_write_marked_extents() which convert extent state's state bit into EXTENT_NEED_WAIT from EXTENT_DIRTY/EXTENT_NEW, however only EXTENT_DIRTY and EXTENT_NEW are searched by free_log_tree() so that those extent states with EXTENT_NEED_WAIT lead to memory leak. cc: Signed-off-by: Liu Bo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1920c2149f88..79af4ae042ae 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3026,13 +3026,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans, while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, - 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW, + 0, &start, &end, + EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT, NULL); if (ret) break; clear_extent_bits(&log->dirty_log_pages, start, end, - EXTENT_DIRTY | EXTENT_NEW); + EXTENT_DIRTY | EXTENT_NEW | EXTENT_NEED_WAIT); } /* From e8f1bc1493855e32b7a2a019decc3c353d94daf6 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 25 Jan 2018 11:02:53 -0700 Subject: [PATCH 004/149] Btrfs: fix btrfs_evict_inode to handle abnormal inodes correctly This regression is introduced in commit 3d48d9810de4 ("btrfs: Handle uninitialised inode eviction"). There are two problems, a) it is ->destroy_inode() that does the final free on inode, not ->evict_inode(), b) clear_inode() must be called before ->evict_inode() returns. This could end up hitting BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); in evict() because I_CLEAR is set in clear_inode(). Fixes: commit 3d48d9810de4 ("btrfs: Handle uninitialised inode eviction") Cc: # v4.7-rc6+ Signed-off-by: Liu Bo Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a68a4acd16e5..44a152d8f32f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5281,7 +5281,7 @@ void btrfs_evict_inode(struct inode *inode) trace_btrfs_inode_evict(inode); if (!root) { - kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); + clear_inode(inode); return; } From 1a932ef4e47984dee227834667b5ff5a334e4805 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 25 Jan 2018 11:02:54 -0700 Subject: [PATCH 005/149] Btrfs: fix use-after-free on root->orphan_block_rsv I got these from running generic/475, WARNING: CPU: 0 PID: 26384 at fs/btrfs/inode.c:3326 btrfs_orphan_commit_root+0x1ac/0x2b0 [btrfs] BUG: unable to handle kernel NULL pointer dereference at 0000000000000010 IP: btrfs_block_rsv_release+0x1c/0x70 [btrfs] Call Trace: btrfs_orphan_release_metadata+0x9f/0x200 [btrfs] btrfs_orphan_del+0x10d/0x170 [btrfs] btrfs_setattr+0x500/0x640 [btrfs] notify_change+0x7ae/0x870 do_truncate+0xca/0x130 vfs_truncate+0x2ee/0x3d0 do_sys_truncate+0xaf/0xf0 SyS_truncate+0xe/0x10 entry_SYSCALL_64_fastpath+0x1f/0x96 The race is between btrfs_orphan_commit_root and btrfs_orphan_del, t1 t2 btrfs_orphan_commit_root btrfs_orphan_del spin_lock check (&root->orphan_inodes) root->orphan_block_rsv = NULL; spin_unlock atomic_dec(&root->orphan_inodes); access root->orphan_block_rsv Accessing root->orphan_block_rsv must be done before decreasing root->orphan_inodes. cc: v3.12+ Fixes: 703c88e03524 ("Btrfs: fix tracking of orphan inode count") Signed-off-by: Liu Bo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 44a152d8f32f..29b491328f4e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3387,6 +3387,11 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, ret = btrfs_orphan_reserve_metadata(trans, inode); ASSERT(!ret); if (ret) { + /* + * dec doesn't need spin_lock as ->orphan_block_rsv + * would be released only if ->orphan_inodes is + * zero. + */ atomic_dec(&root->orphan_inodes); clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, &inode->runtime_flags); @@ -3401,12 +3406,17 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, if (insert >= 1) { ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode)); if (ret) { - atomic_dec(&root->orphan_inodes); if (reserve) { clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, &inode->runtime_flags); btrfs_orphan_release_metadata(inode); } + /* + * btrfs_orphan_commit_root may race with us and set + * ->orphan_block_rsv to zero, in order to avoid that, + * decrease ->orphan_inodes after everything is done. + */ + atomic_dec(&root->orphan_inodes); if (ret != -EEXIST) { clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &inode->runtime_flags); @@ -3438,29 +3448,27 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; int delete_item = 0; - int release_rsv = 0; int ret = 0; - spin_lock(&root->orphan_lock); if (test_and_clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &inode->runtime_flags)) delete_item = 1; + if (delete_item && trans) + ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode)); + if (test_and_clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED, &inode->runtime_flags)) - release_rsv = 1; - spin_unlock(&root->orphan_lock); - - if (delete_item) { - atomic_dec(&root->orphan_inodes); - if (trans) - ret = btrfs_del_orphan_item(trans, root, - btrfs_ino(inode)); - } - - if (release_rsv) btrfs_orphan_release_metadata(inode); + /* + * btrfs_orphan_commit_root may race with us and set ->orphan_block_rsv + * to zero, in order to avoid that, decrease ->orphan_inodes after + * everything is done. + */ + if (delete_item) + atomic_dec(&root->orphan_inodes); + return ret; } From 900c9981680067573671ecc5cbfa7c5770be3a40 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 25 Jan 2018 11:02:56 -0700 Subject: [PATCH 006/149] Btrfs: fix unexpected -EEXIST when creating new inode The highest objectid, which is assigned to new inode, is decided at the time of initializing fs roots. However, in cases where log replay gets processed, the btree which fs root owns might be changed, so we have to search it again for the highest objectid, otherwise creating new inode would end up with -EEXIST. cc: v4.4-rc6+ Fixes: f32e48e92596 ("Btrfs: Initialize btrfs_root->highest_objectid when loading tree root and subvolume roots") Signed-off-by: Liu Bo Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 79af4ae042ae..61f20c367aaf 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -28,6 +28,7 @@ #include "hash.h" #include "compression.h" #include "qgroup.h" +#include "inode-map.h" /* magic values for the inode_only field in btrfs_log_inode: * @@ -5685,6 +5686,23 @@ again: path); } + if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { + struct btrfs_root *root = wc.replay_dest; + + btrfs_release_path(path); + + /* + * We have just replayed everything, and the highest + * objectid of fs roots probably has changed in case + * some inode_item's got replayed. + * + * root->objectid_mutex is not acquired as log replay + * could only happen during mount. + */ + ret = btrfs_find_highest_objectid(root, + &root->highest_objectid); + } + key.offset = found_key.offset - 1; wc.replay_dest->log_root = NULL; free_extent_buffer(log->node); From 952bd3db0dada9994fa7edd891178075abcc045d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 29 Jan 2018 15:53:01 +0200 Subject: [PATCH 007/149] btrfs: Ignore errors from btrfs_qgroup_trace_extent_post Running generic/019 with qgroups on the scratch device enabled is almost guaranteed to trigger the BUG_ON in btrfs_free_tree_block. It's supposed to trigger only on -ENOMEM, in reality, however, it's possible to get -EIO from btrfs_qgroup_trace_extent_post. This function just finds the roots of the extent being tracked and sets the qrecord->old_roots list. If this operation fails nothing critical happens except the quota accounting can be considered wrong. In such case just set the INCONSISTENT flag for the quota and print a warning, rather than killing off the system. Additionally, it's possible to trigger a BUG_ON in btrfs_truncate_inode_items as well. Signed-off-by: Nikolay Borisov Reviewed-by: Qu Wenruo [ error message adjustments ] Signed-off-by: David Sterba --- fs/btrfs/delayed-ref.c | 3 ++- fs/btrfs/qgroup.c | 9 +++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index a1a40cf382e3..7ab5e0128f0c 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -821,7 +821,8 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, spin_unlock(&delayed_refs->lock); if (qrecord_inserted) - return btrfs_qgroup_trace_extent_post(fs_info, record); + btrfs_qgroup_trace_extent_post(fs_info, record); + return 0; free_head_ref: diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 9e61dd624f7b..aa259d6986e1 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -1442,8 +1442,13 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info, int ret; ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false); - if (ret < 0) - return ret; + if (ret < 0) { + fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; + btrfs_warn(fs_info, +"error accounting new delayed refs extent (err code: %d), quota inconsistent", + ret); + return 0; + } /* * Here we don't need to get the lock of From c8195a7b1ad5648857ce20ba24f384faed8512bc Mon Sep 17 00:00:00 2001 From: Zygo Blaxell Date: Tue, 23 Jan 2018 22:22:09 -0500 Subject: [PATCH 008/149] btrfs: remove spurious WARN_ON(ref->count < 0) in find_parent_nodes Until v4.14, this warning was very infrequent: WARNING: CPU: 3 PID: 18172 at fs/btrfs/backref.c:1391 find_parent_nodes+0xc41/0x14e0 Modules linked in: [...] CPU: 3 PID: 18172 Comm: bees Tainted: G D W L 4.11.9-zb64+ #1 Hardware name: System manufacturer System Product Name/M5A78L-M/USB3, BIOS 2101 12/02/2014 Call Trace: dump_stack+0x85/0xc2 __warn+0xd1/0xf0 warn_slowpath_null+0x1d/0x20 find_parent_nodes+0xc41/0x14e0 __btrfs_find_all_roots+0xad/0x120 ? extent_same_check_offsets+0x70/0x70 iterate_extent_inodes+0x168/0x300 iterate_inodes_from_logical+0x87/0xb0 ? iterate_inodes_from_logical+0x87/0xb0 ? extent_same_check_offsets+0x70/0x70 btrfs_ioctl+0x8ac/0x2820 ? lock_acquire+0xc2/0x200 do_vfs_ioctl+0x91/0x700 ? __fget+0x112/0x200 SyS_ioctl+0x79/0x90 entry_SYSCALL_64_fastpath+0x23/0xc6 ? trace_hardirqs_off_caller+0x1f/0x140 Starting with v4.14 (specifically 86d5f9944252 ("btrfs: convert prelimary reference tracking to use rbtrees")) the WARN_ON occurs three orders of magnitude more frequently--almost once per second while running workloads like bees. Replace the WARN_ON() with a comment rationale for its removal. The rationale is paraphrased from an explanation by Edmund Nadolski on the linux-btrfs mailing list. Fixes: 8da6d5815c59 ("Btrfs: added btrfs_find_all_roots()") Signed-off-by: Zygo Blaxell Reviewed-by: Lu Fengqi Signed-off-by: David Sterba --- fs/btrfs/backref.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index e4054e533f6d..f94b2d8c744a 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1264,7 +1264,16 @@ again: while (node) { ref = rb_entry(node, struct prelim_ref, rbnode); node = rb_next(&ref->rbnode); - WARN_ON(ref->count < 0); + /* + * ref->count < 0 can happen here if there are delayed + * refs with a node->action of BTRFS_DROP_DELAYED_REF. + * prelim_ref_insert() relies on this when merging + * identical refs to keep the overall count correct. + * prelim_ref_insert() will merge only those refs + * which compare identically. Any refs having + * e.g. different offsets would not be merged, + * and would retain their original ref->count < 0. + */ if (roots && ref->count && ref->root_id && ref->parent == 0) { if (sc && sc->root_objectid && ref->root_id != sc->root_objectid) { From 627e08738e4315458c5df06358ce7a65cfdd635d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 30 Jan 2018 18:40:22 +0000 Subject: [PATCH 009/149] Btrfs: fix null pointer dereference when replacing missing device When we are replacing a missing device we mount the filesystem with the degraded mode option in which case we are allowed to have a btrfs device structure without a backing device member (its bdev member is NULL) and therefore we can't dereference that member. Commit 38b5f68e9811 ("btrfs: drop btrfs_device::can_discard to query directly") started to dereference that member when discarding extents, resulting in a null pointer dereference: [ 3145.322257] BTRFS warning (device sdf): devid 2 uuid 4d922414-58eb-4880-8fed-9c3840f6c5d5 is missing [ 3145.364116] BTRFS info (device sdf): dev_replace from (devid 2) to /dev/sdg started [ 3145.413489] BUG: unable to handle kernel NULL pointer dereference at 00000000000000e0 [ 3145.415085] IP: btrfs_discard_extent+0x6a/0xf8 [btrfs] [ 3145.415085] PGD 0 P4D 0 [ 3145.415085] Oops: 0000 [#1] PREEMPT SMP PTI [ 3145.415085] Modules linked in: ppdev ghash_clmulni_intel pcbc aesni_intel aes_x86_64 crypto_simd cryptd glue_helper evdev psmouse parport_pc serio_raw i2c_piix4 i2 [ 3145.415085] CPU: 0 PID: 11989 Comm: btrfs Tainted: G W 4.15.0-rc9-btrfs-next-55+ #1 [ 3145.415085] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.10.2-0-g5f4c7b1-prebuilt.qemu-project.org 04/01/2014 [ 3145.415085] RIP: 0010:btrfs_discard_extent+0x6a/0xf8 [btrfs] [ 3145.415085] RSP: 0018:ffffc90004813c60 EFLAGS: 00010293 [ 3145.415085] RAX: ffff88020d39cc00 RBX: ffff88020c4ea2a0 RCX: 0000000000000002 [ 3145.415085] RDX: 0000000000000000 RSI: ffff88020c4ea240 RDI: 0000000000000000 [ 3145.415085] RBP: 0000000000000000 R08: 0000000000004000 R09: 0000000000000000 [ 3145.415085] R10: ffffc90004813ae8 R11: 0000000000000000 R12: 0000000000000000 [ 3145.415085] R13: ffff88020c418000 R14: 0000000000000000 R15: 0000000000000000 [ 3145.415085] FS: 00007f565681f8c0(0000) GS:ffff88023fc00000(0000) knlGS:0000000000000000 [ 3145.415085] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 3145.415085] CR2: 00000000000000e0 CR3: 000000020d208006 CR4: 00000000001606f0 [ 3145.415085] Call Trace: [ 3145.415085] btrfs_finish_extent_commit+0x9a/0x1be [btrfs] [ 3145.415085] btrfs_commit_transaction+0x649/0x7a0 [btrfs] [ 3145.415085] ? start_transaction+0x2b0/0x3b3 [btrfs] [ 3145.415085] btrfs_dev_replace_start+0x274/0x30c [btrfs] [ 3145.415085] btrfs_dev_replace_by_ioctl+0x45/0x59 [btrfs] [ 3145.415085] btrfs_ioctl+0x1a91/0x1d62 [btrfs] [ 3145.415085] ? lock_acquire+0x16a/0x1af [ 3145.415085] ? vfs_ioctl+0x1b/0x28 [ 3145.415085] ? trace_hardirqs_on_caller+0x14c/0x1a6 [ 3145.415085] vfs_ioctl+0x1b/0x28 [ 3145.415085] do_vfs_ioctl+0x5a9/0x5e0 [ 3145.415085] ? _raw_spin_unlock_irq+0x34/0x46 [ 3145.415085] ? entry_SYSCALL_64_fastpath+0x5/0x8b [ 3145.415085] ? trace_hardirqs_on_caller+0x14c/0x1a6 [ 3145.415085] SyS_ioctl+0x52/0x76 [ 3145.415085] entry_SYSCALL_64_fastpath+0x1e/0x8b [ 3145.415085] RIP: 0033:0x7f56558b3c47 [ 3145.415085] RSP: 002b:00007ffdcfac4c58 EFLAGS: 00000202 [ 3145.415085] Code: be 02 00 00 00 4c 89 ef e8 b9 e7 03 00 85 c0 89 c5 75 75 48 8b 44 24 08 45 31 f6 48 8d 58 60 eb 52 48 8b 03 48 8b b8 a0 00 00 00 <48> 8b 87 e0 00 [ 3145.415085] RIP: btrfs_discard_extent+0x6a/0xf8 [btrfs] RSP: ffffc90004813c60 [ 3145.415085] CR2: 00000000000000e0 [ 3145.458185] ---[ end trace 06302e7ac31902bf ]--- This is trivially reproduced by running the test btrfs/027 from fstests like this: $ MOUNT_OPTIONS="-o discard" ./check btrfs/027 Fix this by skipping devices without a backing device before attempting to discard. Fixes: 38b5f68e9811 ("btrfs: drop btrfs_device::can_discard to query directly") Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 05751a677da4..c1618ab9fecf 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2147,6 +2147,10 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 bytes; struct request_queue *req_q; + if (!stripe->dev->bdev) { + ASSERT(btrfs_test_opt(fs_info, DEGRADED)); + continue; + } req_q = bdev_get_queue(stripe->dev->bdev); if (!blk_queue_discard(req_q)) continue; From fd649f10c3d21ee9d7542c609f29978bdf73ab94 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 30 Jan 2018 16:07:37 +0200 Subject: [PATCH 010/149] btrfs: Fix use-after-free when cleaning up fs_devs with a single stale device Commit 4fde46f0cc71 ("Btrfs: free the stale device") introduced btrfs_free_stale_device which iterates the device lists for all registered btrfs filesystems and deletes those devices which aren't mounted. In a btrfs_devices structure has only 1 device attached to it and it is unused then btrfs_free_stale_devices will proceed to also free the btrfs_fs_devices struct itself. Currently this leads to a use after free since list_for_each_entry will try to perform a check on the already freed memory to see if it has to terminate the loop. The fix is to use 'break' when we know we are freeing the current fs_devs. Fixes: 4fde46f0cc71 ("Btrfs: free the stale device") Signed-off-by: Nikolay Borisov Reviewed-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b5036bd69e6a..2ceb924ca0d6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -645,6 +645,7 @@ static void btrfs_free_stale_devices(const char *path, btrfs_sysfs_remove_fsid(fs_devs); list_del(&fs_devs->list); free_fs_devices(fs_devs); + break; } else { fs_devs->num_devices--; list_del(&dev->dev_list); From 3f2f7c553d077be6a30cb96b2976a2c940bf5335 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Mon, 29 Jan 2018 14:23:15 +0800 Subject: [PATCH 011/149] ALSA: hda - Fix headset mic detection problem for two Dell machines One of them has the codec of alc256 and the other one has the codec of alc289. Cc: Signed-off-by: Hui Wang Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 23475888192b..b791efe07fc0 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6734,6 +6734,11 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { {0x12, 0xb7a60130}, {0x14, 0x90170110}, {0x21, 0x02211020}), + SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, + {0x12, 0x90a60130}, + {0x14, 0x90170110}, + {0x14, 0x01011020}, + {0x21, 0x0221101f}), SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, ALC256_STANDARD_PINS), SND_HDA_PIN_QUIRK(0x10ec0256, 0x1043, "ASUS", ALC256_FIXUP_ASUS_MIC, @@ -6803,6 +6808,10 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { {0x12, 0x90a60120}, {0x14, 0x90170110}, {0x21, 0x0321101f}), + SND_HDA_PIN_QUIRK(0x10ec0289, 0x1028, "Dell", ALC225_FIXUP_DELL1_MIC_NO_PRESENCE, + {0x12, 0xb7a60130}, + {0x14, 0x90170110}, + {0x21, 0x04211020}), SND_HDA_PIN_QUIRK(0x10ec0290, 0x103c, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC1, ALC290_STANDARD_PINS, {0x15, 0x04211040}, From 40e2c4e5a7efcd50983aacbddd3c617e776018bf Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Fri, 2 Feb 2018 15:13:09 +0800 Subject: [PATCH 012/149] ALSA: hda/realtek - Add headset mode support for Dell laptop This platform had two Dmic and single Dmic. This update was for single Dmic. This commit was for two Dmic. Fixes: 75ee94b20b46 ("ALSA: hda - fix headset mic problem for Dell machines...") Signed-off-by: Kailang Yang Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index b791efe07fc0..0c61751caa0a 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6321,6 +6321,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x075d, "Dell AIO", ALC298_FIXUP_SPK_VOLUME), SND_PCI_QUIRK(0x1028, 0x0798, "Dell Inspiron 17 7000 Gaming", ALC256_FIXUP_DELL_INSPIRON_7559_SUBWOOFER), SND_PCI_QUIRK(0x1028, 0x082a, "Dell XPS 13 9360", ALC256_FIXUP_DELL_XPS_13_HEADPHONE_NOISE), + SND_PCI_QUIRK(0x1028, 0x084b, "Dell", ALC274_FIXUP_DELL_AIO_LINEOUT_VERB), + SND_PCI_QUIRK(0x1028, 0x084e, "Dell", ALC274_FIXUP_DELL_AIO_LINEOUT_VERB), SND_PCI_QUIRK(0x1028, 0x164a, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x1028, 0x164b, "Dell", ALC293_FIXUP_DELL1_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x1586, "HP", ALC269_FIXUP_HP_MUTE_LED_MIC2), From 61fcf8ece9b6b09450250c4ca40cc3b81a96a68d Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Fri, 2 Feb 2018 15:26:46 +0800 Subject: [PATCH 013/149] ALSA: hda/realtek - Enable Thinkpad Dock device for ALC298 platform Thinkpad Dock device support for ALC298 platform. It need to use SSID for the quirk table. Because IdeaPad also has ALC298 platform. Use verb for the quirk table will confuse. Signed-off-by: Kailang Yang Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 42 +++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 0c61751caa0a..32938ca8e5e3 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -4972,6 +4972,28 @@ static void alc_fixup_tpt440_dock(struct hda_codec *codec, } } +static void alc_fixup_tpt470_dock(struct hda_codec *codec, + const struct hda_fixup *fix, int action) +{ + static const struct hda_pintbl pincfgs[] = { + { 0x17, 0x21211010 }, /* dock headphone */ + { 0x19, 0x21a11010 }, /* dock mic */ + { } + }; + struct alc_spec *spec = codec->spec; + + if (action == HDA_FIXUP_ACT_PRE_PROBE) { + spec->parse_flags = HDA_PINCFG_NO_HP_FIXUP; + /* Enable DOCK device */ + snd_hda_codec_write(codec, 0x17, 0, + AC_VERB_SET_CONFIG_DEFAULT_BYTES_3, 0); + /* Enable DOCK device */ + snd_hda_codec_write(codec, 0x19, 0, + AC_VERB_SET_CONFIG_DEFAULT_BYTES_3, 0); + snd_hda_apply_pincfgs(codec, pincfgs); + } +} + static void alc_shutup_dell_xps13(struct hda_codec *codec) { struct alc_spec *spec = codec->spec; @@ -5446,6 +5468,7 @@ enum { ALC700_FIXUP_INTEL_REFERENCE, ALC274_FIXUP_DELL_BIND_DACS, ALC274_FIXUP_DELL_AIO_LINEOUT_VERB, + ALC298_FIXUP_TPT470_DOCK, }; static const struct hda_fixup alc269_fixups[] = { @@ -6271,6 +6294,12 @@ static const struct hda_fixup alc269_fixups[] = { .chained = true, .chain_id = ALC274_FIXUP_DELL_BIND_DACS }, + [ALC298_FIXUP_TPT470_DOCK] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc_fixup_tpt470_dock, + .chained = true, + .chain_id = ALC293_FIXUP_LENOVO_SPK_NOISE + }, }; static const struct snd_pci_quirk alc269_fixup_tbl[] = { @@ -6452,8 +6481,16 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x2218, "Thinkpad X1 Carbon 2nd", ALC292_FIXUP_TPT440_DOCK), SND_PCI_QUIRK(0x17aa, 0x2223, "ThinkPad T550", ALC292_FIXUP_TPT440_DOCK), SND_PCI_QUIRK(0x17aa, 0x2226, "ThinkPad X250", ALC292_FIXUP_TPT440_DOCK), + SND_PCI_QUIRK(0x17aa, 0x222d, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), + SND_PCI_QUIRK(0x17aa, 0x222e, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), SND_PCI_QUIRK(0x17aa, 0x2231, "Thinkpad T560", ALC292_FIXUP_TPT460), SND_PCI_QUIRK(0x17aa, 0x2233, "Thinkpad", ALC292_FIXUP_TPT460), + SND_PCI_QUIRK(0x17aa, 0x2245, "Thinkpad T470", ALC298_FIXUP_TPT470_DOCK), + SND_PCI_QUIRK(0x17aa, 0x2246, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), + SND_PCI_QUIRK(0x17aa, 0x2247, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), + SND_PCI_QUIRK(0x17aa, 0x224b, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), + SND_PCI_QUIRK(0x17aa, 0x224c, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), + SND_PCI_QUIRK(0x17aa, 0x224d, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), SND_PCI_QUIRK(0x17aa, 0x30bb, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY), SND_PCI_QUIRK(0x17aa, 0x30e2, "ThinkCentre AIO", ALC233_FIXUP_LENOVO_LINE2_MIC_HOTKEY), SND_PCI_QUIRK(0x17aa, 0x310c, "ThinkCentre Station", ALC294_FIXUP_LENOVO_MIC_LOCATION), @@ -6474,7 +6511,12 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x5050, "Thinkpad T560p", ALC292_FIXUP_TPT460), SND_PCI_QUIRK(0x17aa, 0x5051, "Thinkpad L460", ALC292_FIXUP_TPT460), SND_PCI_QUIRK(0x17aa, 0x5053, "Thinkpad T460", ALC292_FIXUP_TPT460), + SND_PCI_QUIRK(0x17aa, 0x505d, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), + SND_PCI_QUIRK(0x17aa, 0x505f, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), + SND_PCI_QUIRK(0x17aa, 0x5062, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), SND_PCI_QUIRK(0x17aa, 0x5109, "Thinkpad", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), + SND_PCI_QUIRK(0x17aa, 0x511e, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), + SND_PCI_QUIRK(0x17aa, 0x511f, "Thinkpad", ALC298_FIXUP_TPT470_DOCK), SND_PCI_QUIRK(0x17aa, 0x3bf8, "Quanta FL1", ALC269_FIXUP_PCM_44K), SND_PCI_QUIRK(0x17aa, 0x9e54, "LENOVO NB", ALC269_FIXUP_LENOVO_EAPD), SND_PCI_QUIRK(0x1b7d, 0xa831, "Ordissimo EVE2 ", ALC269VB_FIXUP_ORDISSIMO_EVE2), /* Also known as Malata PC-B1303 */ From ad6a0a52e6de3d1161b7999c7903db906ba4cf79 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 31 Jan 2018 18:31:24 +0200 Subject: [PATCH 014/149] nvme: rename NVME_CTRL_RECONNECTING state to NVME_CTRL_CONNECTING In pci transport, this state is used to mark the initialization process. This should be also used in other transports as well. Signed-off-by: Max Gurtovoy Reviewed-by: James Smart Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 10 +++++----- drivers/nvme/host/fabrics.h | 9 +++++---- drivers/nvme/host/fc.c | 14 +++++++------- drivers/nvme/host/nvme.h | 2 +- drivers/nvme/host/pci.c | 8 ++++---- drivers/nvme/host/rdma.c | 6 +++--- 6 files changed, 25 insertions(+), 24 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f431c32774f3..1033de4136e0 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -265,7 +265,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (new_state) { case NVME_CTRL_ADMIN_ONLY: switch (old_state) { - case NVME_CTRL_RECONNECTING: + case NVME_CTRL_CONNECTING: changed = true; /* FALLTHRU */ default: @@ -276,7 +276,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, switch (old_state) { case NVME_CTRL_NEW: case NVME_CTRL_RESETTING: - case NVME_CTRL_RECONNECTING: + case NVME_CTRL_CONNECTING: changed = true; /* FALLTHRU */ default: @@ -294,7 +294,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, break; } break; - case NVME_CTRL_RECONNECTING: + case NVME_CTRL_CONNECTING: switch (old_state) { case NVME_CTRL_LIVE: case NVME_CTRL_RESETTING: @@ -309,7 +309,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_LIVE: case NVME_CTRL_ADMIN_ONLY: case NVME_CTRL_RESETTING: - case NVME_CTRL_RECONNECTING: + case NVME_CTRL_CONNECTING: changed = true; /* FALLTHRU */ default: @@ -2687,7 +2687,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev, [NVME_CTRL_LIVE] = "live", [NVME_CTRL_ADMIN_ONLY] = "only-admin", [NVME_CTRL_RESETTING] = "resetting", - [NVME_CTRL_RECONNECTING]= "reconnecting", + [NVME_CTRL_CONNECTING] = "connecting", [NVME_CTRL_DELETING] = "deleting", [NVME_CTRL_DEAD] = "dead", }; diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 25b19f722f5b..a3145d90c1d2 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -171,13 +171,14 @@ static inline blk_status_t nvmf_check_init_req(struct nvme_ctrl *ctrl, cmd->common.opcode != nvme_fabrics_command || cmd->fabrics.fctype != nvme_fabrics_type_connect) { /* - * Reconnecting state means transport disruption, which can take - * a long time and even might fail permanently, fail fast to - * give upper layers a chance to failover. + * Connecting state means transport disruption or initial + * establishment, which can take a long time and even might + * fail permanently, fail fast to give upper layers a chance + * to failover. * Deleting state means that the ctrl will never accept commands * again, fail it permanently. */ - if (ctrl->state == NVME_CTRL_RECONNECTING || + if (ctrl->state == NVME_CTRL_CONNECTING || ctrl->state == NVME_CTRL_DELETING) { nvme_req(rq)->status = NVME_SC_ABORT_REQ; return BLK_STS_IOERR; diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index b856d7c919d2..e2df22d56b2a 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -532,7 +532,7 @@ nvme_fc_resume_controller(struct nvme_fc_ctrl *ctrl) { switch (ctrl->ctrl.state) { case NVME_CTRL_NEW: - case NVME_CTRL_RECONNECTING: + case NVME_CTRL_CONNECTING: /* * As all reconnects were suppressed, schedule a * connect. @@ -777,7 +777,7 @@ nvme_fc_ctrl_connectivity_loss(struct nvme_fc_ctrl *ctrl) } break; - case NVME_CTRL_RECONNECTING: + case NVME_CTRL_CONNECTING: /* * The association has already been terminated and the * controller is attempting reconnects. No need to do anything @@ -1722,7 +1722,7 @@ done: if (status && (blk_queue_dying(rq->q) || ctrl->ctrl.state == NVME_CTRL_NEW || - ctrl->ctrl.state == NVME_CTRL_RECONNECTING)) + ctrl->ctrl.state == NVME_CTRL_CONNECTING)) status |= cpu_to_le16(NVME_SC_DNR << 1); if (__nvme_fc_fcpop_chk_teardowns(ctrl, op)) @@ -2943,7 +2943,7 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status) unsigned long recon_delay = ctrl->ctrl.opts->reconnect_delay * HZ; bool recon = true; - if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) + if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) return; if (portptr->port_state == FC_OBJSTATE_ONLINE) @@ -2991,10 +2991,10 @@ nvme_fc_reset_ctrl_work(struct work_struct *work) /* will block will waiting for io to terminate */ nvme_fc_delete_association(ctrl); - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { dev_err(ctrl->ctrl.device, "NVME-FC{%d}: error_recovery: Couldn't change state " - "to RECONNECTING\n", ctrl->cnum); + "to CONNECTING\n", ctrl->cnum); return; } @@ -3195,7 +3195,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, * transport errors (frame drop, LS failure) inherently must kill * the association. The transport is coded so that any command used * to create the association (prior to a LIVE state transition - * while NEW or RECONNECTING) will fail if it completes in error or + * while NEW or CONNECTING) will fail if it completes in error or * times out. * * As such: as the connect request was mostly likely due to a diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 8e4550fa08f8..27e31c00b306 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -123,7 +123,7 @@ enum nvme_ctrl_state { NVME_CTRL_LIVE, NVME_CTRL_ADMIN_ONLY, /* Only admin queue live */ NVME_CTRL_RESETTING, - NVME_CTRL_RECONNECTING, + NVME_CTRL_CONNECTING, NVME_CTRL_DELETING, NVME_CTRL_DEAD, }; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 6fe7af00a1f4..ab9c19525fa8 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1141,7 +1141,7 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) /* If there is a reset/reinit ongoing, we shouldn't reset again. */ switch (dev->ctrl.state) { case NVME_CTRL_RESETTING: - case NVME_CTRL_RECONNECTING: + case NVME_CTRL_CONNECTING: return false; default: break; @@ -2288,12 +2288,12 @@ static void nvme_reset_work(struct work_struct *work) nvme_dev_disable(dev, false); /* - * Introduce RECONNECTING state from nvme-fc/rdma transports to mark the + * Introduce CONNECTING state from nvme-fc/rdma transports to mark the * initializing procedure here. */ - if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RECONNECTING)) { + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) { dev_warn(dev->ctrl.device, - "failed to mark controller RECONNECTING\n"); + "failed to mark controller CONNECTING\n"); goto out; } diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 2bc059f7d73c..050eaa24cc7d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -887,7 +887,7 @@ free_ctrl: static void nvme_rdma_reconnect_or_remove(struct nvme_rdma_ctrl *ctrl) { /* If we are resetting/deleting then do nothing */ - if (ctrl->ctrl.state != NVME_CTRL_RECONNECTING) { + if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) { WARN_ON_ONCE(ctrl->ctrl.state == NVME_CTRL_NEW || ctrl->ctrl.state == NVME_CTRL_LIVE); return; @@ -973,7 +973,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_start_queues(&ctrl->ctrl); - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure should never happen */ WARN_ON_ONCE(1); return; @@ -1756,7 +1756,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) nvme_stop_ctrl(&ctrl->ctrl); nvme_rdma_shutdown_ctrl(ctrl, false); - if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RECONNECTING)) { + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure should never happen */ WARN_ON_ONCE(1); return; From b754a32c66772e6510acd92237aadd4cf227ae39 Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 31 Jan 2018 18:31:25 +0200 Subject: [PATCH 015/149] nvme-rdma: use NVME_CTRL_CONNECTING state to mark init process In order to avoid concurrent error recovery during initialization process (allowed by the NVME_CTRL_NEW --> NVME_CTRL_RESETTING transition) we must mark the ctrl as CONNECTING before initial connection establisment. Signed-off-by: Max Gurtovoy Reviewed-by: James Smart Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 1 + drivers/nvme/host/rdma.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1033de4136e0..86dca2919e19 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -296,6 +296,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, break; case NVME_CTRL_CONNECTING: switch (old_state) { + case NVME_CTRL_NEW: case NVME_CTRL_LIVE: case NVME_CTRL_RESETTING: changed = true; diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 050eaa24cc7d..5e2cc4f0d207 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1942,6 +1942,9 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev, if (!ctrl->queues) goto out_uninit_ctrl; + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING); + WARN_ON_ONCE(!changed); + ret = nvme_rdma_configure_admin_queue(ctrl, true); if (ret) goto out_kfree_queues; From 3096a739d2ccbfd6a626e388228a16558f76d79d Mon Sep 17 00:00:00 2001 From: Max Gurtovoy Date: Wed, 31 Jan 2018 18:31:26 +0200 Subject: [PATCH 016/149] nvme: delete NVME_CTRL_LIVE --> NVME_CTRL_CONNECTING transition There is no logical reason to move from live state to connecting state. In case of initial connection establishment, the transition should be NVME_CTRL_NEW --> NVME_CTRL_CONNECTING --> NVME_CTRL_LIVE. In case of error recovery or reset, the transition should be NVME_CTRL_LIVE --> NVME_CTRL_RESETTING --> NVME_CTRL_CONNECTING --> NVME_CTRL_LIVE. Signed-off-by: Max Gurtovoy Reviewed-by: James Smart Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 86dca2919e19..1f9278364196 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -297,7 +297,6 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, case NVME_CTRL_CONNECTING: switch (old_state) { case NVME_CTRL_NEW: - case NVME_CTRL_LIVE: case NVME_CTRL_RESETTING: changed = true; /* FALLTHRU */ From 8cb6af7b3a6d47f95ecb461a3f8d39cf6a64e4ae Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 31 Jan 2018 17:01:58 -0700 Subject: [PATCH 017/149] nvme: Fix discard buffer overrun This patch checks the discard range array bounds before setting it in case the driver gets a badly formed request. Signed-off-by: Keith Busch Reviewed-by: Jens Axboe Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 1f9278364196..2fd8688cfa47 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -518,9 +518,11 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector); u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; - range[n].cattr = cpu_to_le32(0); - range[n].nlb = cpu_to_le32(nlb); - range[n].slba = cpu_to_le64(slba); + if (n < segments) { + range[n].cattr = cpu_to_le32(0); + range[n].nlb = cpu_to_le32(nlb); + range[n].slba = cpu_to_le64(slba); + } n++; } From 6e59de2048eb375a9bfcd39461ef841cd2a78962 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Thu, 8 Feb 2018 17:46:01 +0800 Subject: [PATCH 018/149] drm/amdgpu: add new device to use atpx quirk The affected system (0x0813) is pretty similar to another one (0x0812), it also needs to use ATPX power control. Signed-off-by: Kai-Heng Feng Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c index e2c3c5ec42d1..c53095b3b0fb 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atpx_handler.c @@ -568,6 +568,7 @@ static const struct amdgpu_px_quirk amdgpu_px_quirk_list[] = { /* HG _PR3 doesn't seem to work on this A+A weston board */ { 0x1002, 0x6900, 0x1002, 0x0124, AMDGPU_PX_QUIRK_FORCE_ATPX }, { 0x1002, 0x6900, 0x1028, 0x0812, AMDGPU_PX_QUIRK_FORCE_ATPX }, + { 0x1002, 0x6900, 0x1028, 0x0813, AMDGPU_PX_QUIRK_FORCE_ATPX }, { 0, 0, 0, 0, 0 }, }; From 26d99834f89e76514076d9cd06f61e56e6a509b8 Mon Sep 17 00:00:00 2001 From: Greg Kurz Date: Mon, 22 Jan 2018 22:02:05 +0100 Subject: [PATCH 019/149] 9p/trans_virtio: discard zero-length reply When a 9p request is successfully flushed, the server is expected to just mark it as used without sending a 9p reply (ie, without writing data into the buffer). In this case, virtqueue_get_buf() will return len == 0 and we must not report a REQ_STATUS_RCVD status to the client, otherwise the client will erroneously assume the request has not been flushed. Cc: stable@vger.kernel.org Signed-off-by: Greg Kurz Signed-off-by: Michael S. Tsirkin --- net/9p/trans_virtio.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index f3a4efcf1456..3aa5a93ad107 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -160,7 +160,8 @@ static void req_done(struct virtqueue *vq) spin_unlock_irqrestore(&chan->lock, flags); /* Wakeup if anyone waiting for VirtIO ring space. */ wake_up(chan->vc_wq); - p9_client_cb(chan->client, req, REQ_STATUS_RCVD); + if (len) + p9_client_cb(chan->client, req, REQ_STATUS_RCVD); } } From 6a546c7e69ff0b69581377cc70d7e8a601b98fce Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 9 Feb 2018 15:30:56 -0500 Subject: [PATCH 020/149] membarrier-sync-core: Document architecture support Ensure we gather architecture requirements about each architecture supporting the "sync_core" membarrier command in a single file under Documentation/features. Signed-off-by: Mathieu Desnoyers Cc: Andrea Parri Cc: Andrew Hunter Cc: Andy Lutomirski Cc: Avi Kivity Cc: Benjamin Herrenschmidt Cc: Boqun Feng Cc: Dave Watson Cc: David Sehr Cc: Greg Hackmann Cc: Linus Torvalds Cc: Maged Michael Cc: Michael Ellerman Cc: Paul E. McKenney Cc: Paul Mackerras Cc: Peter Zijlstra (Intel) Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Cc: linux-api@vger.kernel.org Cc: linux-arch@vger.kernel.org Link: http://lkml.kernel.org/r/1518208256-22034-1-git-send-email-mathieu.desnoyers@efficios.com Signed-off-by: Ingo Molnar --- .../membarrier-sync-core/arch-support.txt | 62 +++++++++++++++++++ 1 file changed, 62 insertions(+) create mode 100644 Documentation/features/sched/membarrier-sync-core/arch-support.txt diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt new file mode 100644 index 000000000000..2c815a7f1ba7 --- /dev/null +++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt @@ -0,0 +1,62 @@ +# +# Feature name: membarrier-sync-core +# Kconfig: ARCH_HAS_MEMBARRIER_SYNC_CORE +# description: arch supports core serializing membarrier +# +# Architecture requirements +# +# * arm64 +# +# Rely on eret context synchronization when returning from IPI handler, and +# when returning to user-space. +# +# * x86 +# +# x86-32 uses IRET as return from interrupt, which takes care of the IPI. +# However, it uses both IRET and SYSEXIT to go back to user-space. The IRET +# instruction is core serializing, but not SYSEXIT. +# +# x86-64 uses IRET as return from interrupt, which takes care of the IPI. +# However, it can return to user-space through either SYSRETL (compat code), +# SYSRETQ, or IRET. +# +# Given that neither SYSRET{L,Q}, nor SYSEXIT, are core serializing, we rely +# instead on write_cr3() performed by switch_mm() to provide core serialization +# after changing the current mm, and deal with the special case of kthread -> +# uthread (temporarily keeping current mm into active_mm) by issuing a +# sync_core_before_usermode() in that specific case. +# + ----------------------- + | arch |status| + ----------------------- + | alpha: | TODO | + | arc: | TODO | + | arm: | TODO | + | arm64: | ok | + | blackfin: | TODO | + | c6x: | TODO | + | cris: | TODO | + | frv: | TODO | + | h8300: | TODO | + | hexagon: | TODO | + | ia64: | TODO | + | m32r: | TODO | + | m68k: | TODO | + | metag: | TODO | + | microblaze: | TODO | + | mips: | TODO | + | mn10300: | TODO | + | nios2: | TODO | + | openrisc: | TODO | + | parisc: | TODO | + | powerpc: | TODO | + | s390: | TODO | + | score: | TODO | + | sh: | TODO | + | sparc: | TODO | + | tile: | TODO | + | um: | TODO | + | unicore32: | TODO | + | x86: | ok | + | xtensa: | TODO | + ----------------------- From 3efd6e8ebe19f0774c82de582849539b60cc4d97 Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 6 Feb 2018 06:48:29 -0800 Subject: [PATCH 021/149] nvme_fc: correct abort race condition on resets During reset handling, there is live io completing while the reset is taking place. The reset path attempts to abort all outstanding io, counting the number of ios that were reset. It then waits for those ios to be reclaimed from the lldd before continuing. The transport's logic on io state and flag setting was poor, allowing ios to complete simultaneous to the abort request. The completed ios were counted, but as the completion had already occurred, the completion never reduced the count. As the count never zeros, the reset/delete never completes. Tighten it up by unconditionally changing the op state to completed when the io done handler is called. The reset/abort path now changes the op state to aborted, but the abort only continues if the op state was live priviously. If complete, the abort is backed out. Thus proper counting of io aborts and their completions is working again. Also removed the TERMIO state on the op as it's redundant with the op's aborted state. Reviewed-by: Johannes Thumshirn Signed-off-by: James Smart Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fc.c | 98 +++++++++++------------------------------- 1 file changed, 26 insertions(+), 72 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index e2df22d56b2a..4673882ce152 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1512,13 +1512,19 @@ nvme_fc_exit_request(struct blk_mq_tag_set *set, struct request *rq, static int __nvme_fc_abort_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op) { - int state; + unsigned long flags; + int opstate; - state = atomic_xchg(&op->state, FCPOP_STATE_ABORTED); - if (state != FCPOP_STATE_ACTIVE) { - atomic_set(&op->state, state); + spin_lock_irqsave(&ctrl->lock, flags); + opstate = atomic_xchg(&op->state, FCPOP_STATE_ABORTED); + if (opstate != FCPOP_STATE_ACTIVE) + atomic_set(&op->state, opstate); + else if (ctrl->flags & FCCTRL_TERMIO) + ctrl->iocnt++; + spin_unlock_irqrestore(&ctrl->lock, flags); + + if (opstate != FCPOP_STATE_ACTIVE) return -ECANCELED; - } ctrl->lport->ops->fcp_abort(&ctrl->lport->localport, &ctrl->rport->remoteport, @@ -1532,52 +1538,23 @@ static void nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl) { struct nvme_fc_fcp_op *aen_op = ctrl->aen_ops; - unsigned long flags; - int i, ret; + int i; - for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) { - if (atomic_read(&aen_op->state) != FCPOP_STATE_ACTIVE) - continue; - - spin_lock_irqsave(&ctrl->lock, flags); - if (ctrl->flags & FCCTRL_TERMIO) { - ctrl->iocnt++; - aen_op->flags |= FCOP_FLAGS_TERMIO; - } - spin_unlock_irqrestore(&ctrl->lock, flags); - - ret = __nvme_fc_abort_op(ctrl, aen_op); - if (ret) { - /* - * if __nvme_fc_abort_op failed the io wasn't - * active. Thus this call path is running in - * parallel to the io complete. Treat as non-error. - */ - - /* back out the flags/counters */ - spin_lock_irqsave(&ctrl->lock, flags); - if (ctrl->flags & FCCTRL_TERMIO) - ctrl->iocnt--; - aen_op->flags &= ~FCOP_FLAGS_TERMIO; - spin_unlock_irqrestore(&ctrl->lock, flags); - return; - } - } + for (i = 0; i < NVME_NR_AEN_COMMANDS; i++, aen_op++) + __nvme_fc_abort_op(ctrl, aen_op); } static inline int __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl, - struct nvme_fc_fcp_op *op) + struct nvme_fc_fcp_op *op, int opstate) { unsigned long flags; bool complete_rq = false; spin_lock_irqsave(&ctrl->lock, flags); - if (unlikely(op->flags & FCOP_FLAGS_TERMIO)) { - if (ctrl->flags & FCCTRL_TERMIO) { - if (!--ctrl->iocnt) - wake_up(&ctrl->ioabort_wait); - } + if (opstate == FCPOP_STATE_ABORTED && ctrl->flags & FCCTRL_TERMIO) { + if (!--ctrl->iocnt) + wake_up(&ctrl->ioabort_wait); } if (op->flags & FCOP_FLAGS_RELEASED) complete_rq = true; @@ -1601,6 +1578,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) __le16 status = cpu_to_le16(NVME_SC_SUCCESS << 1); union nvme_result result; bool terminate_assoc = true; + int opstate; /* * WARNING: @@ -1639,11 +1617,12 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) * association to be terminated. */ + opstate = atomic_xchg(&op->state, FCPOP_STATE_COMPLETE); + fc_dma_sync_single_for_cpu(ctrl->lport->dev, op->fcp_req.rspdma, sizeof(op->rsp_iu), DMA_FROM_DEVICE); - if (atomic_read(&op->state) == FCPOP_STATE_ABORTED || - op->flags & FCOP_FLAGS_TERMIO) + if (opstate == FCPOP_STATE_ABORTED) status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); else if (freq->status) status = cpu_to_le16(NVME_SC_INTERNAL << 1); @@ -1708,7 +1687,7 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) done: if (op->flags & FCOP_FLAGS_AEN) { nvme_complete_async_event(&queue->ctrl->ctrl, status, &result); - __nvme_fc_fcpop_chk_teardowns(ctrl, op); + __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate); atomic_set(&op->state, FCPOP_STATE_IDLE); op->flags = FCOP_FLAGS_AEN; /* clear other flags */ nvme_fc_ctrl_put(ctrl); @@ -1725,7 +1704,7 @@ done: ctrl->ctrl.state == NVME_CTRL_CONNECTING)) status |= cpu_to_le16(NVME_SC_DNR << 1); - if (__nvme_fc_fcpop_chk_teardowns(ctrl, op)) + if (__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate)) __nvme_fc_final_op_cleanup(rq); else nvme_end_request(rq, status, result); @@ -2421,8 +2400,7 @@ __nvme_fc_final_op_cleanup(struct request *rq) struct nvme_fc_ctrl *ctrl = op->ctrl; atomic_set(&op->state, FCPOP_STATE_IDLE); - op->flags &= ~(FCOP_FLAGS_TERMIO | FCOP_FLAGS_RELEASED | - FCOP_FLAGS_COMPLETE); + op->flags &= ~(FCOP_FLAGS_RELEASED | FCOP_FLAGS_COMPLETE); nvme_fc_unmap_data(ctrl, rq, op); nvme_complete_rq(rq); @@ -2476,35 +2454,11 @@ nvme_fc_terminate_exchange(struct request *req, void *data, bool reserved) struct nvme_ctrl *nctrl = data; struct nvme_fc_ctrl *ctrl = to_fc_ctrl(nctrl); struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(req); - unsigned long flags; - int status; if (!blk_mq_request_started(req)) return; - spin_lock_irqsave(&ctrl->lock, flags); - if (ctrl->flags & FCCTRL_TERMIO) { - ctrl->iocnt++; - op->flags |= FCOP_FLAGS_TERMIO; - } - spin_unlock_irqrestore(&ctrl->lock, flags); - - status = __nvme_fc_abort_op(ctrl, op); - if (status) { - /* - * if __nvme_fc_abort_op failed the io wasn't - * active. Thus this call path is running in - * parallel to the io complete. Treat as non-error. - */ - - /* back out the flags/counters */ - spin_lock_irqsave(&ctrl->lock, flags); - if (ctrl->flags & FCCTRL_TERMIO) - ctrl->iocnt--; - op->flags &= ~FCOP_FLAGS_TERMIO; - spin_unlock_irqrestore(&ctrl->lock, flags); - return; - } + __nvme_fc_abort_op(ctrl, op); } From c3aedd225f8bcc3b3e61df074bc045b80542b38a Mon Sep 17 00:00:00 2001 From: James Smart Date: Tue, 6 Feb 2018 06:48:30 -0800 Subject: [PATCH 022/149] nvme_fc: cleanup io completion There was some old cold that dealt with complete_rq being called prior to the lldd returning the io completion. This is garbage code. The complete_rq routine was being called after eh_timeouts were called and it was due to eh_timeouts not being handled properly. The timeouts were fixed in prior patches so that in general, a timeout will initiate an abort and the reset timer restarted as the abort operation will take care of completing things. Given the reset timer restarted, the erroneous complete_rq calls were eliminated. So remove the work that was synchronizing complete_rq with io completion. Reviewed-by: Johannes Thumshirn Signed-off-by: James Smart Signed-off-by: Sagi Grimberg --- drivers/nvme/host/fc.c | 63 ++++++++---------------------------------- 1 file changed, 12 insertions(+), 51 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 4673882ce152..7f51f8414b97 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -55,9 +55,7 @@ struct nvme_fc_queue { enum nvme_fcop_flags { FCOP_FLAGS_TERMIO = (1 << 0), - FCOP_FLAGS_RELEASED = (1 << 1), - FCOP_FLAGS_COMPLETE = (1 << 2), - FCOP_FLAGS_AEN = (1 << 3), + FCOP_FLAGS_AEN = (1 << 1), }; struct nvmefc_ls_req_op { @@ -1470,7 +1468,6 @@ nvme_fc_xmt_disconnect_assoc(struct nvme_fc_ctrl *ctrl) /* *********************** NVME Ctrl Routines **************************** */ -static void __nvme_fc_final_op_cleanup(struct request *rq); static void nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg); static int @@ -1544,25 +1541,20 @@ nvme_fc_abort_aen_ops(struct nvme_fc_ctrl *ctrl) __nvme_fc_abort_op(ctrl, aen_op); } -static inline int +static inline void __nvme_fc_fcpop_chk_teardowns(struct nvme_fc_ctrl *ctrl, struct nvme_fc_fcp_op *op, int opstate) { unsigned long flags; - bool complete_rq = false; - spin_lock_irqsave(&ctrl->lock, flags); - if (opstate == FCPOP_STATE_ABORTED && ctrl->flags & FCCTRL_TERMIO) { - if (!--ctrl->iocnt) - wake_up(&ctrl->ioabort_wait); + if (opstate == FCPOP_STATE_ABORTED) { + spin_lock_irqsave(&ctrl->lock, flags); + if (ctrl->flags & FCCTRL_TERMIO) { + if (!--ctrl->iocnt) + wake_up(&ctrl->ioabort_wait); + } + spin_unlock_irqrestore(&ctrl->lock, flags); } - if (op->flags & FCOP_FLAGS_RELEASED) - complete_rq = true; - else - op->flags |= FCOP_FLAGS_COMPLETE; - spin_unlock_irqrestore(&ctrl->lock, flags); - - return complete_rq; } static void @@ -1704,10 +1696,8 @@ done: ctrl->ctrl.state == NVME_CTRL_CONNECTING)) status |= cpu_to_le16(NVME_SC_DNR << 1); - if (__nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate)) - __nvme_fc_final_op_cleanup(rq); - else - nvme_end_request(rq, status, result); + __nvme_fc_fcpop_chk_teardowns(ctrl, op, opstate); + nvme_end_request(rq, status, result); check_error: if (terminate_assoc) @@ -2394,45 +2384,16 @@ nvme_fc_submit_async_event(struct nvme_ctrl *arg) } static void -__nvme_fc_final_op_cleanup(struct request *rq) +nvme_fc_complete_rq(struct request *rq) { struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); struct nvme_fc_ctrl *ctrl = op->ctrl; atomic_set(&op->state, FCPOP_STATE_IDLE); - op->flags &= ~(FCOP_FLAGS_RELEASED | FCOP_FLAGS_COMPLETE); nvme_fc_unmap_data(ctrl, rq, op); nvme_complete_rq(rq); nvme_fc_ctrl_put(ctrl); - -} - -static void -nvme_fc_complete_rq(struct request *rq) -{ - struct nvme_fc_fcp_op *op = blk_mq_rq_to_pdu(rq); - struct nvme_fc_ctrl *ctrl = op->ctrl; - unsigned long flags; - bool completed = false; - - /* - * the core layer, on controller resets after calling - * nvme_shutdown_ctrl(), calls complete_rq without our - * calling blk_mq_complete_request(), thus there may still - * be live i/o outstanding with the LLDD. Means transport has - * to track complete calls vs fcpio_done calls to know what - * path to take on completes and dones. - */ - spin_lock_irqsave(&ctrl->lock, flags); - if (op->flags & FCOP_FLAGS_COMPLETE) - completed = true; - else - op->flags |= FCOP_FLAGS_RELEASED; - spin_unlock_irqrestore(&ctrl->lock, flags); - - if (completed) - __nvme_fc_final_op_cleanup(rq); } /* From 79e902382637a2f421b7f295dcf9934d80d84d7d Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Fri, 9 Feb 2018 17:01:14 +0100 Subject: [PATCH 023/149] Documentation/locking/mutex-design: Update to reflect latest changes Commit 3ca0ff571b09 ("locking/mutex: Rework mutex::owner") reworked the basic mutex implementation to deal with several problems. Documentation was however left unchanged and became stale. Update mutex-design.txt to reflect changes introduced by the above commit. Signed-off-by: Juri Lelli Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/20180209160114.19980-1-juri.lelli@redhat.com [ Small readability tweaks to the text. ] Signed-off-by: Ingo Molnar --- Documentation/locking/mutex-design.txt | 49 +++++++++----------------- 1 file changed, 17 insertions(+), 32 deletions(-) diff --git a/Documentation/locking/mutex-design.txt b/Documentation/locking/mutex-design.txt index 60c482df1a38..818aca19612f 100644 --- a/Documentation/locking/mutex-design.txt +++ b/Documentation/locking/mutex-design.txt @@ -21,37 +21,23 @@ Implementation -------------- Mutexes are represented by 'struct mutex', defined in include/linux/mutex.h -and implemented in kernel/locking/mutex.c. These locks use a three -state atomic counter (->count) to represent the different possible -transitions that can occur during the lifetime of a lock: - - 1: unlocked - 0: locked, no waiters - negative: locked, with potential waiters - -In its most basic form it also includes a wait-queue and a spinlock -that serializes access to it. CONFIG_SMP systems can also include -a pointer to the lock task owner (->owner) as well as a spinner MCS -lock (->osq), both described below in (ii). +and implemented in kernel/locking/mutex.c. These locks use an atomic variable +(->owner) to keep track of the lock state during its lifetime. Field owner +actually contains 'struct task_struct *' to the current lock owner and it is +therefore NULL if not currently owned. Since task_struct pointers are aligned +at at least L1_CACHE_BYTES, low bits (3) are used to store extra state (e.g., +if waiter list is non-empty). In its most basic form it also includes a +wait-queue and a spinlock that serializes access to it. Furthermore, +CONFIG_MUTEX_SPIN_ON_OWNER=y systems use a spinner MCS lock (->osq), described +below in (ii). When acquiring a mutex, there are three possible paths that can be taken, depending on the state of the lock: -(i) fastpath: tries to atomically acquire the lock by decrementing the - counter. If it was already taken by another task it goes to the next - possible path. This logic is architecture specific. On x86-64, the - locking fastpath is 2 instructions: - - 0000000000000e10 : - e21: f0 ff 0b lock decl (%rbx) - e24: 79 08 jns e2e - - the unlocking fastpath is equally tight: - - 0000000000000bc0 : - bc8: f0 ff 07 lock incl (%rdi) - bcb: 7f 0a jg bd7 - +(i) fastpath: tries to atomically acquire the lock by cmpxchg()ing the owner with + the current task. This only works in the uncontended case (cmpxchg() checks + against 0UL, so all 3 state bits above have to be 0). If the lock is + contended it goes to the next possible path. (ii) midpath: aka optimistic spinning, tries to spin for acquisition while the lock owner is running and there are no other tasks ready @@ -143,11 +129,10 @@ Test if the mutex is taken: Disadvantages ------------- -Unlike its original design and purpose, 'struct mutex' is larger than -most locks in the kernel. E.g: on x86-64 it is 40 bytes, almost twice -as large as 'struct semaphore' (24 bytes) and tied, along with rwsems, -for the largest lock in the kernel. Larger structure sizes mean more -CPU cache and memory footprint. +Unlike its original design and purpose, 'struct mutex' is among the largest +locks in the kernel. E.g: on x86-64 it is 32 bytes, where 'struct semaphore' +is 24 bytes and rw_semaphore is 40 bytes. Larger structure sizes mean more CPU +cache and memory footprint. When to use mutexes ------------------- From 1f8ade92a83696986ad34438ce11e38975d1a43d Mon Sep 17 00:00:00 2001 From: Ulf Magnusson Date: Fri, 9 Feb 2018 00:15:36 +0100 Subject: [PATCH 024/149] ALSA: ac97: kconfig: Remove select of undefined symbol AC97 The AC97_BUS_NEW Kconfig symbol selects the globally undefined symbol AC97. Robert Jarzmik confirmed in https://lkml.org/lkml/2018/2/7/96 that the select was put in by mistake and can be safely removed, with no other changes required. Remove it. Fixes: 74426fbff66e ("ALSA: ac97: add an ac97 bus") Signed-off-by: Ulf Magnusson Signed-off-by: Takashi Iwai --- sound/ac97/Kconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/ac97/Kconfig b/sound/ac97/Kconfig index f8a64e15e5bf..baa5f8ef89d2 100644 --- a/sound/ac97/Kconfig +++ b/sound/ac97/Kconfig @@ -5,7 +5,6 @@ config AC97_BUS_NEW tristate - select AC97 help This is the new AC97 bus type, successor of AC97_BUS. The ported drivers which benefit from the AC97 automatic probing should "select" From 5e35dc0338d85ccebacf3f77eca1e5dea73155e8 Mon Sep 17 00:00:00 2001 From: Lassi Ylikojola Date: Fri, 9 Feb 2018 16:51:36 +0200 Subject: [PATCH 025/149] ALSA: usb-audio: add implicit fb quirk for Behringer UFX1204 Add quirk to ensure a sync endpoint is properly configured. This patch is a fix for same symptoms on Behringer UFX1204 as patch from Albertto Aquirre on Dec 8 2016 for Axe-Fx II. Signed-off-by: Lassi Ylikojola Cc: Signed-off-by: Takashi Iwai --- sound/usb/pcm.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sound/usb/pcm.c b/sound/usb/pcm.c index b9c9a19f9588..3cbfae6604f9 100644 --- a/sound/usb/pcm.c +++ b/sound/usb/pcm.c @@ -352,6 +352,15 @@ static int set_sync_ep_implicit_fb_quirk(struct snd_usb_substream *subs, ep = 0x86; iface = usb_ifnum_to_if(dev, 2); + if (!iface || iface->num_altsetting == 0) + return -EINVAL; + + alts = &iface->altsetting[1]; + goto add_sync_ep; + case USB_ID(0x1397, 0x0002): + ep = 0x81; + iface = usb_ifnum_to_if(dev, 1); + if (!iface || iface->num_altsetting == 0) return -EINVAL; From 2bda7141b89aa35308da69aac7f486fa81db73ba Mon Sep 17 00:00:00 2001 From: Matthias Lange Date: Wed, 31 Jan 2018 18:39:12 +0100 Subject: [PATCH 026/149] ALSA: ac97: Fix copy and paste typo in documentation It's 'optional' instead of 'optinal'. Signed-off-by: Matthias Lange Signed-off-by: Takashi Iwai --- include/sound/ac97/regs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sound/ac97/regs.h b/include/sound/ac97/regs.h index 4bb86d379bd5..9a4fa0c3264a 100644 --- a/include/sound/ac97/regs.h +++ b/include/sound/ac97/regs.h @@ -31,7 +31,7 @@ #define AC97_HEADPHONE 0x04 /* Headphone Volume (optional) */ #define AC97_MASTER_MONO 0x06 /* Master Volume Mono (optional) */ #define AC97_MASTER_TONE 0x08 /* Master Tone (Bass & Treble) (optional) */ -#define AC97_PC_BEEP 0x0a /* PC Beep Volume (optinal) */ +#define AC97_PC_BEEP 0x0a /* PC Beep Volume (optional) */ #define AC97_PHONE 0x0c /* Phone Volume (optional) */ #define AC97_MIC 0x0e /* MIC Volume */ #define AC97_LINE 0x10 /* Line In Volume */ From 447cae58cecd69392b74a4a42cd0ab9cabd816af Mon Sep 17 00:00:00 2001 From: Kirill Marinushkin Date: Mon, 29 Jan 2018 06:37:55 +0100 Subject: [PATCH 027/149] ALSA: usb-audio: Fix UAC2 get_ctl request with a RANGE attribute The layout of the UAC2 Control request and response varies depending on the request type. With the current implementation, only the Layout 2 Parameter Block (with the 2-byte sized RANGE attribute) is handled properly. For the Control requests with the 1-byte sized RANGE attribute (Bass Control, Mid Control, Tremble Control), the response is parsed incorrectly. This commit: * fixes the wLength field value in the request * fixes parsing the range values from the response Fixes: 23caaf19b11e ("ALSA: usb-mixer: Add support for Audio Class v2.0") Signed-off-by: Kirill Marinushkin Cc: Signed-off-by: Takashi Iwai --- sound/usb/mixer.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index 9afb8ab524c7..06b22624ab7a 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -347,17 +347,20 @@ static int get_ctl_value_v2(struct usb_mixer_elem_info *cval, int request, int validx, int *value_ret) { struct snd_usb_audio *chip = cval->head.mixer->chip; - unsigned char buf[4 + 3 * sizeof(__u32)]; /* enough space for one range */ + /* enough space for one range */ + unsigned char buf[sizeof(__u16) + 3 * sizeof(__u32)]; unsigned char *val; - int idx = 0, ret, size; + int idx = 0, ret, val_size, size; __u8 bRequest; + val_size = uac2_ctl_value_size(cval->val_type); + if (request == UAC_GET_CUR) { bRequest = UAC2_CS_CUR; - size = uac2_ctl_value_size(cval->val_type); + size = val_size; } else { bRequest = UAC2_CS_RANGE; - size = sizeof(buf); + size = sizeof(__u16) + 3 * val_size; } memset(buf, 0, sizeof(buf)); @@ -390,16 +393,17 @@ error: val = buf + sizeof(__u16); break; case UAC_GET_MAX: - val = buf + sizeof(__u16) * 2; + val = buf + sizeof(__u16) + val_size; break; case UAC_GET_RES: - val = buf + sizeof(__u16) * 3; + val = buf + sizeof(__u16) + val_size * 2; break; default: return -EINVAL; } - *value_ret = convert_signed_value(cval, snd_usb_combine_bytes(val, sizeof(__u16))); + *value_ret = convert_signed_value(cval, + snd_usb_combine_bytes(val, val_size)); return 0; } From 7c74866baef1827e18f8269aec85030063520bd4 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Sun, 11 Feb 2018 09:50:27 -0400 Subject: [PATCH 028/149] ALSA: usb: add more device quirks for USB DSD devices Add some more devices that need quirks to handle DSD modes correctly. Signed-off-by: Daniel Mack Reported-and-tested-by: Thomas Gresens Cc: Signed-off-by: Takashi Iwai --- sound/usb/quirks.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index a66ef5777887..ea8f3de92fa4 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1363,8 +1363,11 @@ u64 snd_usb_interface_dsd_format_quirks(struct snd_usb_audio *chip, return SNDRV_PCM_FMTBIT_DSD_U32_BE; break; - /* Amanero Combo384 USB interface with native DSD support */ - case USB_ID(0x16d0, 0x071a): + /* Amanero Combo384 USB based DACs with native DSD support */ + case USB_ID(0x16d0, 0x071a): /* Amanero - Combo384 */ + case USB_ID(0x2ab6, 0x0004): /* T+A DAC8DSD-V2.0, MP1000E-V2.0, MP2000R-V2.0, MP2500R-V2.0, MP3100HV-V2.0 */ + case USB_ID(0x2ab6, 0x0005): /* T+A USB HD Audio 1 */ + case USB_ID(0x2ab6, 0x0006): /* T+A USB HD Audio 2 */ if (fp->altsetting == 2) { switch (le16_to_cpu(chip->dev->descriptor.bcdDevice)) { case 0x199: From 3cd091a773936c54344a519f7ee1379ccb620bee Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 9 Feb 2018 22:55:28 +0100 Subject: [PATCH 029/149] ACPI / EC: Restore polling during noirq suspend/resume phases Commit 662591461c4b (ACPI / EC: Drop EC noirq hooks to fix a regression) modified the ACPI EC driver so that it doesn't switch over to busy polling mode during noirq stages of system suspend and resume in an attempt to fix an issue resulting from that behavior. However, that modification introduced a system resume regression on Thinkpad X240, so make the EC driver switch over to the polling mode during noirq stages of system suspend and resume again, which effectively reverts the problematic commit. Fixes: 662591461c4b (ACPI / EC: Drop EC noirq hooks to fix a regression) Link: https://bugzilla.kernel.org/show_bug.cgi?id=197863 Reported-by: Markus Demleitner Tested-by: Markus Demleitner Signed-off-by: Rafael J. Wysocki --- drivers/acpi/ec.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index d9f38c645e4a..30a572956557 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -1927,6 +1927,9 @@ static int acpi_ec_suspend_noirq(struct device *dev) ec->reference_count >= 1) acpi_set_gpe(NULL, ec->gpe, ACPI_GPE_DISABLE); + if (acpi_sleep_no_ec_events()) + acpi_ec_enter_noirq(ec); + return 0; } @@ -1934,6 +1937,9 @@ static int acpi_ec_resume_noirq(struct device *dev) { struct acpi_ec *ec = acpi_driver_data(to_acpi_device(dev)); + if (acpi_sleep_no_ec_events()) + acpi_ec_leave_noirq(ec); + if (ec_no_wakeup && test_bit(EC_FLAGS_STARTED, &ec->flags) && ec->reference_count >= 1) acpi_set_gpe(NULL, ec->gpe, ACPI_GPE_ENABLE); From 5a9e59e8d9dd9586d78c244b9d96fb18156daad3 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 9 Feb 2018 12:08:21 -0600 Subject: [PATCH 030/149] ACPI: SPCR: Mark expected switch fall-through in acpi_parse_spcr In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Addresses-Coverity-ID: 1465078 Signed-off-by: Gustavo A. R. Silva Signed-off-by: Rafael J. Wysocki --- drivers/acpi/spcr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/spcr.c b/drivers/acpi/spcr.c index 89e97d21a89c..9d52743080a4 100644 --- a/drivers/acpi/spcr.c +++ b/drivers/acpi/spcr.c @@ -115,6 +115,7 @@ int __init acpi_parse_spcr(bool enable_earlycon, bool enable_console) table->serial_port.access_width))) { default: pr_err("Unexpected SPCR Access Width. Defaulting to byte size\n"); + /* fall through */ case 8: iotype = "mmio"; break; From 4a823c0be80fa996234ebb41c80d40458b1bec1e Mon Sep 17 00:00:00 2001 From: Jia-Ju Bai Date: Fri, 26 Jan 2018 16:48:49 +0800 Subject: [PATCH 031/149] opp: cpu: Replace GFP_ATOMIC with GFP_KERNEL in dev_pm_opp_init_cpufreq_table After checking all possible call chains to dev_pm_opp_init_cpufreq_table() here, my tool finds that this function is never called in atomic context, namely never in an interrupt handler or holding a spinlock. And dev_pm_opp_init_cpufreq_table() calls dev_pm_opp_get_opp_count(), which calls mutex_lock that can sleep. It indicates that atmtcp_v_send() can call functions which may sleep. Thus GFP_ATOMIC is not necessary, and it can be replaced with GFP_KERNEL. This is found by a static analysis tool named DCNS written by myself. Signed-off-by: Jia-Ju Bai Signed-off-by: Viresh Kumar --- drivers/opp/cpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/opp/cpu.c b/drivers/opp/cpu.c index 2d87bc1adf38..0c0910709435 100644 --- a/drivers/opp/cpu.c +++ b/drivers/opp/cpu.c @@ -55,7 +55,7 @@ int dev_pm_opp_init_cpufreq_table(struct device *dev, if (max_opps <= 0) return max_opps ? max_opps : -ENODATA; - freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_ATOMIC); + freq_table = kcalloc((max_opps + 1), sizeof(*freq_table), GFP_KERNEL); if (!freq_table) return -ENOMEM; From 4222f38ca3b7ae30ace582077677cec8b88fac36 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 9 Feb 2018 17:38:33 +0200 Subject: [PATCH 032/149] ACPI / bus: Do not traverse through non-existed device table When __acpi_match_device() is called it would be possible to have ACPI ID table a NULL pointer. To avoid potential dereference, check for this before traverse. While here, remove redundant 'else'. Note, this patch implies a bit of refactoring acpi_of_match_device() to return pointer to OF ID when matched followed by refactoring __acpi_match_device() to return either ACPI or OF ID when matches. Signed-off-by: Andy Shevchenko Reviewed-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/acpi/bus.c | 63 ++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 25 deletions(-) diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index 676c9788e1c8..f1384e107eed 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -660,13 +660,15 @@ struct acpi_device *acpi_companion_match(const struct device *dev) * acpi_of_match_device - Match device object using the "compatible" property. * @adev: ACPI device object to match. * @of_match_table: List of device IDs to match against. + * @of_id: OF ID if matched * * If @dev has an ACPI companion which has ACPI_DT_NAMESPACE_HID in its list of * identifiers and a _DSD object with the "compatible" property, use that * property to match against the given list of identifiers. */ static bool acpi_of_match_device(struct acpi_device *adev, - const struct of_device_id *of_match_table) + const struct of_device_id *of_match_table, + const struct of_device_id **of_id) { const union acpi_object *of_compatible, *obj; int i, nval; @@ -690,8 +692,11 @@ static bool acpi_of_match_device(struct acpi_device *adev, const struct of_device_id *id; for (id = of_match_table; id->compatible[0]; id++) - if (!strcasecmp(obj->string.pointer, id->compatible)) + if (!strcasecmp(obj->string.pointer, id->compatible)) { + if (of_id) + *of_id = id; return true; + } } return false; @@ -762,10 +767,11 @@ static bool __acpi_match_device_cls(const struct acpi_device_id *id, return true; } -static const struct acpi_device_id *__acpi_match_device( - struct acpi_device *device, - const struct acpi_device_id *ids, - const struct of_device_id *of_ids) +static bool __acpi_match_device(struct acpi_device *device, + const struct acpi_device_id *acpi_ids, + const struct of_device_id *of_ids, + const struct acpi_device_id **acpi_id, + const struct of_device_id **of_id) { const struct acpi_device_id *id; struct acpi_hardware_id *hwid; @@ -775,30 +781,32 @@ static const struct acpi_device_id *__acpi_match_device( * driver for it. */ if (!device || !device->status.present) - return NULL; + return false; list_for_each_entry(hwid, &device->pnp.ids, list) { /* First, check the ACPI/PNP IDs provided by the caller. */ - for (id = ids; id->id[0] || id->cls; id++) { - if (id->id[0] && !strcmp((char *) id->id, hwid->id)) - return id; - else if (id->cls && __acpi_match_device_cls(id, hwid)) - return id; + if (acpi_ids) { + for (id = acpi_ids; id->id[0] || id->cls; id++) { + if (id->id[0] && !strcmp((char *)id->id, hwid->id)) + goto out_acpi_match; + if (id->cls && __acpi_match_device_cls(id, hwid)) + goto out_acpi_match; + } } /* * Next, check ACPI_DT_NAMESPACE_HID and try to match the * "compatible" property if found. - * - * The id returned by the below is not valid, but the only - * caller passing non-NULL of_ids here is only interested in - * whether or not the return value is NULL. */ - if (!strcmp(ACPI_DT_NAMESPACE_HID, hwid->id) - && acpi_of_match_device(device, of_ids)) - return id; + if (!strcmp(ACPI_DT_NAMESPACE_HID, hwid->id)) + return acpi_of_match_device(device, of_ids, of_id); } - return NULL; + return false; + +out_acpi_match: + if (acpi_id) + *acpi_id = id; + return true; } /** @@ -815,7 +823,10 @@ static const struct acpi_device_id *__acpi_match_device( const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids, const struct device *dev) { - return __acpi_match_device(acpi_companion_match(dev), ids, NULL); + const struct acpi_device_id *id = NULL; + + __acpi_match_device(acpi_companion_match(dev), ids, NULL, &id, NULL); + return id; } EXPORT_SYMBOL_GPL(acpi_match_device); @@ -840,7 +851,7 @@ EXPORT_SYMBOL_GPL(acpi_get_match_data); int acpi_match_device_ids(struct acpi_device *device, const struct acpi_device_id *ids) { - return __acpi_match_device(device, ids, NULL) ? 0 : -ENOENT; + return __acpi_match_device(device, ids, NULL, NULL, NULL) ? 0 : -ENOENT; } EXPORT_SYMBOL(acpi_match_device_ids); @@ -849,10 +860,12 @@ bool acpi_driver_match_device(struct device *dev, { if (!drv->acpi_match_table) return acpi_of_match_device(ACPI_COMPANION(dev), - drv->of_match_table); + drv->of_match_table, + NULL); - return !!__acpi_match_device(acpi_companion_match(dev), - drv->acpi_match_table, drv->of_match_table); + return __acpi_match_device(acpi_companion_match(dev), + drv->acpi_match_table, drv->of_match_table, + NULL, NULL); } EXPORT_SYMBOL_GPL(acpi_driver_match_device); From 8ff277c5bf87d750a44a656d4f113462493acbfc Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 9 Feb 2018 17:38:34 +0200 Subject: [PATCH 033/149] ACPI / bus: Remove checks in acpi_get_match_data() As well as its sibling of_device_get_match_data() has no such checks, no need to do it in acpi_get_match_data(). First of all, we are not supposed to call fwnode API like this without driver attached. Second, since __acpi_match_device() does check input parameter there is no need to duplicate it outside. And last but not least one, the API should still serve the cases when ACPI device is enumerated via PRP0001. In such case driver has neither ACPI table nor driver data there. Signed-off-by: Andy Shevchenko Reviewed-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/acpi/bus.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index f1384e107eed..ca4af098b1bf 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -834,12 +834,6 @@ void *acpi_get_match_data(const struct device *dev) { const struct acpi_device_id *match; - if (!dev->driver) - return NULL; - - if (!dev->driver->acpi_match_table) - return NULL; - match = acpi_match_device(dev->driver->acpi_match_table, dev); if (!match) return NULL; From 29d5325a14ab49375476e3a6442ff40a008a8c9a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 9 Feb 2018 17:38:35 +0200 Subject: [PATCH 034/149] ACPI / bus: Rename acpi_get_match_data() to acpi_device_get_match_data() Do the renaming to be consistent with its sibling, i.e. of_device_get_match_data(). No functional change. Signed-off-by: Andy Shevchenko Reviewed-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/acpi/bus.c | 4 ++-- drivers/acpi/property.c | 2 +- include/linux/acpi.h | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index ca4af098b1bf..e6285b5ce0d5 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -830,7 +830,7 @@ const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids, } EXPORT_SYMBOL_GPL(acpi_match_device); -void *acpi_get_match_data(const struct device *dev) +void *acpi_device_get_match_data(const struct device *dev) { const struct acpi_device_id *match; @@ -840,7 +840,7 @@ void *acpi_get_match_data(const struct device *dev) return (void *)match->driver_data; } -EXPORT_SYMBOL_GPL(acpi_get_match_data); +EXPORT_SYMBOL_GPL(acpi_device_get_match_data); int acpi_match_device_ids(struct acpi_device *device, const struct acpi_device_id *ids) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 466d1503aba0..f9b5fa230a86 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -1275,7 +1275,7 @@ static void * acpi_fwnode_device_get_match_data(const struct fwnode_handle *fwnode, const struct device *dev) { - return acpi_get_match_data(dev); + return acpi_device_get_match_data(dev); } #define DECLARE_ACPI_FWNODE_OPS(ops) \ diff --git a/include/linux/acpi.h b/include/linux/acpi.h index 64e10746f282..bdf47e0f92e9 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -587,7 +587,7 @@ extern int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *), const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids, const struct device *dev); -void *acpi_get_match_data(const struct device *dev); +void *acpi_device_get_match_data(const struct device *dev); extern bool acpi_driver_match_device(struct device *dev, const struct device_driver *drv); int acpi_device_uevent_modalias(struct device *, struct kobj_uevent_env *); @@ -766,7 +766,7 @@ static inline const struct acpi_device_id *acpi_match_device( return NULL; } -static inline void *acpi_get_match_data(const struct device *dev) +static inline void *acpi_device_get_match_data(const struct device *dev) { return NULL; } From 67dcc26d208ca5578f08c3c78cb254418c24e9ec Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 9 Feb 2018 17:38:36 +0200 Subject: [PATCH 035/149] device property: Constify device_get_match_data() Constify device_get_match_data() as OF and ACPI variants return constant value. Acked-by: Sakari Ailus Signed-off-by: Andy Shevchenko Reviewed-by: Mika Westerberg Signed-off-by: Rafael J. Wysocki --- drivers/acpi/bus.c | 4 ++-- drivers/acpi/property.c | 2 +- drivers/base/property.c | 5 ++--- drivers/of/property.c | 4 ++-- include/linux/acpi.h | 4 ++-- include/linux/fwnode.h | 4 ++-- include/linux/property.h | 2 +- 7 files changed, 12 insertions(+), 13 deletions(-) diff --git a/drivers/acpi/bus.c b/drivers/acpi/bus.c index e6285b5ce0d5..0dad0bd9327b 100644 --- a/drivers/acpi/bus.c +++ b/drivers/acpi/bus.c @@ -830,7 +830,7 @@ const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids, } EXPORT_SYMBOL_GPL(acpi_match_device); -void *acpi_device_get_match_data(const struct device *dev) +const void *acpi_device_get_match_data(const struct device *dev) { const struct acpi_device_id *match; @@ -838,7 +838,7 @@ void *acpi_device_get_match_data(const struct device *dev) if (!match) return NULL; - return (void *)match->driver_data; + return (const void *)match->driver_data; } EXPORT_SYMBOL_GPL(acpi_device_get_match_data); diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index f9b5fa230a86..5815356ea6ad 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -1271,7 +1271,7 @@ static int acpi_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, return 0; } -static void * +static const void * acpi_fwnode_device_get_match_data(const struct fwnode_handle *fwnode, const struct device *dev) { diff --git a/drivers/base/property.c b/drivers/base/property.c index 302236281d83..8f205f6461ed 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -1410,9 +1410,8 @@ int fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, } EXPORT_SYMBOL(fwnode_graph_parse_endpoint); -void *device_get_match_data(struct device *dev) +const void *device_get_match_data(struct device *dev) { - return fwnode_call_ptr_op(dev_fwnode(dev), device_get_match_data, - dev); + return fwnode_call_ptr_op(dev_fwnode(dev), device_get_match_data, dev); } EXPORT_SYMBOL_GPL(device_get_match_data); diff --git a/drivers/of/property.c b/drivers/of/property.c index 36ed84e26d9c..f46828e3b082 100644 --- a/drivers/of/property.c +++ b/drivers/of/property.c @@ -977,11 +977,11 @@ static int of_fwnode_graph_parse_endpoint(const struct fwnode_handle *fwnode, return 0; } -static void * +static const void * of_fwnode_device_get_match_data(const struct fwnode_handle *fwnode, const struct device *dev) { - return (void *)of_device_get_match_data(dev); + return of_device_get_match_data(dev); } const struct fwnode_operations of_fwnode_ops = { diff --git a/include/linux/acpi.h b/include/linux/acpi.h index bdf47e0f92e9..968173ec2726 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -587,7 +587,7 @@ extern int acpi_nvs_for_each_region(int (*func)(__u64, __u64, void *), const struct acpi_device_id *acpi_match_device(const struct acpi_device_id *ids, const struct device *dev); -void *acpi_device_get_match_data(const struct device *dev); +const void *acpi_device_get_match_data(const struct device *dev); extern bool acpi_driver_match_device(struct device *dev, const struct device_driver *drv); int acpi_device_uevent_modalias(struct device *, struct kobj_uevent_env *); @@ -766,7 +766,7 @@ static inline const struct acpi_device_id *acpi_match_device( return NULL; } -static inline void *acpi_device_get_match_data(const struct device *dev) +static inline const void *acpi_device_get_match_data(const struct device *dev) { return NULL; } diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 4fa1a489efe4..4fe8f289b3f6 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -73,8 +73,8 @@ struct fwnode_operations { struct fwnode_handle *(*get)(struct fwnode_handle *fwnode); void (*put)(struct fwnode_handle *fwnode); bool (*device_is_available)(const struct fwnode_handle *fwnode); - void *(*device_get_match_data)(const struct fwnode_handle *fwnode, - const struct device *dev); + const void *(*device_get_match_data)(const struct fwnode_handle *fwnode, + const struct device *dev); bool (*property_present)(const struct fwnode_handle *fwnode, const char *propname); int (*property_read_int_array)(const struct fwnode_handle *fwnode, diff --git a/include/linux/property.h b/include/linux/property.h index 769d372c1edf..2eea4b310fc2 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -283,7 +283,7 @@ bool device_dma_supported(struct device *dev); enum dev_dma_attr device_get_dma_attr(struct device *dev); -void *device_get_match_data(struct device *dev); +const void *device_get_match_data(struct device *dev); int device_get_phy_mode(struct device *dev); From ea56fb282368ea08c2a313af6b55cb597aec4db1 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Fri, 9 Feb 2018 13:21:42 +0100 Subject: [PATCH 036/149] mtd: nand: vf610: set correct ooblayout With commit 3cf32d180227 ("mtd: nand: vf610: switch to mtd_ooblayout_ops") the driver started to use the NAND cores default large page ooblayout. However, shortly after commit 6a623e076944 ("mtd: nand: add ooblayout for old hamming layout") changed the default layout to the old hamming layout, which is not what vf610_nfc is using. Specify the default large page layout explicitly. Fixes: 6a623e076944 ("mtd: nand: add ooblayout for old hamming layout") Cc: # v4.12+ Signed-off-by: Stefan Agner Signed-off-by: Boris Brezillon --- drivers/mtd/nand/vf610_nfc.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/mtd/nand/vf610_nfc.c b/drivers/mtd/nand/vf610_nfc.c index 80d31a58e558..f367144f3c6f 100644 --- a/drivers/mtd/nand/vf610_nfc.c +++ b/drivers/mtd/nand/vf610_nfc.c @@ -752,10 +752,8 @@ static int vf610_nfc_probe(struct platform_device *pdev) if (mtd->oobsize > 64) mtd->oobsize = 64; - /* - * mtd->ecclayout is not specified here because we're using the - * default large page ECC layout defined in NAND core. - */ + /* Use default large page ECC layout defined in NAND core */ + mtd_set_ooblayout(mtd, &nand_ooblayout_lp_ops); if (chip->ecc.strength == 32) { nfc->ecc_mode = ECC_60_BYTE; chip->ecc.bytes = 60; From f23def8038611fa362de345c540107c78edaa085 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 30 Jan 2018 14:23:21 +0100 Subject: [PATCH 037/149] mtd: nand: MTD_NAND_MARVELL should depend on HAS_DMA If NO_DMA=y: ERROR: "bad_dma_ops" [drivers/mtd/nand/marvell_nand.ko] undefined! Add a dependency on HAS_DMA to fix this. Fixes: 02f26ecf8c772751 ("mtd: nand: add reworked Marvell NAND controller driver") Signed-off-by: Geert Uytterhoeven Acked-by: Miquel Raynal Signed-off-by: Boris Brezillon --- drivers/mtd/nand/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig index e6b8c59f2c0d..736ac887303c 100644 --- a/drivers/mtd/nand/Kconfig +++ b/drivers/mtd/nand/Kconfig @@ -328,7 +328,7 @@ config MTD_NAND_MARVELL tristate "NAND controller support on Marvell boards" depends on PXA3xx || ARCH_MMP || PLAT_ORION || ARCH_MVEBU || \ COMPILE_TEST - depends on HAS_IOMEM + depends on HAS_IOMEM && HAS_DMA help This enables the NAND flash controller driver for Marvell boards, including: From b6d8ef86cb7b8b6920b6815ebf1352757d3adb87 Mon Sep 17 00:00:00 2001 From: Aishwarya Pant Date: Wed, 7 Feb 2018 19:04:36 +0530 Subject: [PATCH 038/149] Documentation/ABI: update cpuidle sysfs documentation Update cpuidle documentation using git logs and existing documentation in Documentation/cpuidle/sysfs.txt. This might be useful for scripting and tracking changes in the ABI. Signed-off-by: Aishwarya Pant Signed-off-by: Rafael J. Wysocki --- .../ABI/testing/sysfs-devices-system-cpu | 77 ++++++++++++++++++- 1 file changed, 75 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index bfd29bc8d37a..4ed63b6cfb15 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -108,6 +108,8 @@ Description: CPU topology files that describe a logical CPU's relationship What: /sys/devices/system/cpu/cpuidle/current_driver /sys/devices/system/cpu/cpuidle/current_governer_ro + /sys/devices/system/cpu/cpuidle/available_governors + /sys/devices/system/cpu/cpuidle/current_governor Date: September 2007 Contact: Linux kernel mailing list Description: Discover cpuidle policy and mechanism @@ -119,13 +121,84 @@ Description: Discover cpuidle policy and mechanism Idle policy (governor) is differentiated from idle mechanism (driver) - current_driver: displays current idle mechanism + current_driver: (RO) displays current idle mechanism - current_governor_ro: displays current idle policy + current_governor_ro: (RO) displays current idle policy + + With the cpuidle_sysfs_switch boot option enabled (meant for + developer testing), the following three attributes are visible + instead: + + current_driver: same as described above + + available_governors: (RO) displays a space separated list of + available governors + + current_governor: (RW) displays current idle policy. Users can + switch the governor at runtime by writing to this file. See files in Documentation/cpuidle/ for more information. +What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/name + /sys/devices/system/cpu/cpuX/cpuidle/stateN/latency + /sys/devices/system/cpu/cpuX/cpuidle/stateN/power + /sys/devices/system/cpu/cpuX/cpuidle/stateN/time + /sys/devices/system/cpu/cpuX/cpuidle/stateN/usage +Date: September 2007 +KernelVersion: v2.6.24 +Contact: Linux power management list +Description: + The directory /sys/devices/system/cpu/cpuX/cpuidle contains per + logical CPU specific cpuidle information for each online cpu X. + The processor idle states which are available for use have the + following attributes: + + name: (RO) Name of the idle state (string). + + latency: (RO) The latency to exit out of this idle state (in + microseconds). + + power: (RO) The power consumed while in this idle state (in + milliwatts). + + time: (RO) The total time spent in this idle state (in microseconds). + + usage: (RO) Number of times this state was entered (a count). + + +What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/desc +Date: February 2008 +KernelVersion: v2.6.25 +Contact: Linux power management list +Description: + (RO) A small description about the idle state (string). + + +What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/disable +Date: March 2012 +KernelVersion: v3.10 +Contact: Linux power management list +Description: + (RW) Option to disable this idle state (bool). The behavior and + the effect of the disable variable depends on the implementation + of a particular governor. In the ladder governor, for example, + it is not coherent, i.e. if one is disabling a light state, then + all deeper states are disabled as well, but the disable variable + does not reflect it. Likewise, if one enables a deep state but a + lighter state still is disabled, then this has no effect. + + +What: /sys/devices/system/cpu/cpuX/cpuidle/stateN/residency +Date: March 2014 +KernelVersion: v3.15 +Contact: Linux power management list +Description: + (RO) Display the target residency i.e. the minimum amount of + time (in microseconds) this cpu should spend in this idle state + to make the transition worth the effort. + + What: /sys/devices/system/cpu/cpu#/cpufreq/* Date: pre-git history Contact: linux-pm@vger.kernel.org From 69728051f5bf15efaf6edfbcfe1b5a49a2437918 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Fri, 9 Feb 2018 08:11:26 -0800 Subject: [PATCH 039/149] PM / wakeirq: Fix unbalanced IRQ enable for wakeirq If a device is runtime PM suspended when we enter suspend and has a dedicated wake IRQ, we can get the following warning: WARNING: CPU: 0 PID: 108 at kernel/irq/manage.c:526 enable_irq+0x40/0x94 [ 102.087860] Unbalanced enable for IRQ 147 ... (enable_irq) from [] (dev_pm_arm_wake_irq+0x4c/0x60) (dev_pm_arm_wake_irq) from [] (device_wakeup_arm_wake_irqs+0x58/0x9c) (device_wakeup_arm_wake_irqs) from [] (dpm_suspend_noirq+0x10/0x48) (dpm_suspend_noirq) from [] (suspend_devices_and_enter+0x30c/0xf14) (suspend_devices_and_enter) from [] (enter_state+0xad4/0xbd8) (enter_state) from [] (pm_suspend+0x38/0x98) (pm_suspend) from [] (state_store+0x68/0xc8) This is because the dedicated wake IRQ for the device may have been already enabled earlier by dev_pm_enable_wake_irq_check(). Fix the issue by checking for runtime PM suspended status. This issue can be easily reproduced by setting serial console log level to zero, letting the serial console idle, and suspend the system from an ssh terminal. On resume, dmesg will have the warning above. The reason why I have not run into this issue earlier has been that I typically run my PM test cases from on a serial console instead over ssh. Fixes: c84345597558 (PM / wakeirq: Enable dedicated wakeirq for suspend) Signed-off-by: Tony Lindgren Signed-off-by: Rafael J. Wysocki --- drivers/base/power/wakeirq.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/base/power/wakeirq.c b/drivers/base/power/wakeirq.c index a8ac86e4d79e..6637fc319269 100644 --- a/drivers/base/power/wakeirq.c +++ b/drivers/base/power/wakeirq.c @@ -321,7 +321,8 @@ void dev_pm_arm_wake_irq(struct wake_irq *wirq) return; if (device_may_wakeup(wirq->dev)) { - if (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED) + if (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED && + !pm_runtime_status_suspended(wirq->dev)) enable_irq(wirq->irq); enable_irq_wake(wirq->irq); @@ -343,7 +344,8 @@ void dev_pm_disarm_wake_irq(struct wake_irq *wirq) if (device_may_wakeup(wirq->dev)) { disable_irq_wake(wirq->irq); - if (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED) + if (wirq->status & WAKE_IRQ_DEDICATED_ALLOCATED && + !pm_runtime_status_suspended(wirq->dev)) disable_irq_nosync(wirq->irq); } } From 433986c2c265d106d6a8e88006e0131fefc92b7b Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Sat, 10 Feb 2018 19:13:58 +0100 Subject: [PATCH 040/149] PM / runtime: Update links_count also if !CONFIG_SRCU Commit baa8809f6097 (PM / runtime: Optimize the use of device links) added an invocation of pm_runtime_drop_link() to __device_link_del(). However there are two variants of that function, one for CONFIG_SRCU and another for !CONFIG_SRCU, and the commit only modified the former. Fixes: baa8809f6097 (PM / runtime: Optimize the use of device links) Cc: v4.10+ # v4.10+ Signed-off-by: Lukas Wunner Signed-off-by: Rafael J. Wysocki --- drivers/base/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/base/core.c b/drivers/base/core.c index b2261f92f2f1..5847364f25d9 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -310,6 +310,9 @@ static void __device_link_del(struct device_link *link) dev_info(link->consumer, "Dropping the link to %s\n", dev_name(link->supplier)); + if (link->flags & DL_FLAG_PM_RUNTIME) + pm_runtime_drop_link(link->consumer); + list_del(&link->s_node); list_del(&link->c_node); device_link_free(link); From 6b4af818c7d7a35a861c94596e05e43596e5fd28 Mon Sep 17 00:00:00 2001 From: Aishwarya Pant Date: Sat, 10 Feb 2018 14:27:19 +0530 Subject: [PATCH 041/149] ACPI / DPTF: Document dptf_power sysfs atttributes The descriptions have been collected from git commit logs and reading through code. Signed-off-by: Aishwarya Pant Signed-off-by: Rafael J. Wysocki --- Documentation/ABI/testing/sysfs-platform-dptf | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-platform-dptf diff --git a/Documentation/ABI/testing/sysfs-platform-dptf b/Documentation/ABI/testing/sysfs-platform-dptf new file mode 100644 index 000000000000..325dc0667dbb --- /dev/null +++ b/Documentation/ABI/testing/sysfs-platform-dptf @@ -0,0 +1,40 @@ +What: /sys/bus/platform/devices/INT3407:00/dptf_power/charger_type +Date: Jul, 2016 +KernelVersion: v4.10 +Contact: linux-acpi@vger.kernel.org +Description: + (RO) The charger type - Traditional, Hybrid or NVDC. + +What: /sys/bus/platform/devices/INT3407:00/dptf_power/adapter_rating_mw +Date: Jul, 2016 +KernelVersion: v4.10 +Contact: linux-acpi@vger.kernel.org +Description: + (RO) Adapter rating in milliwatts (the maximum Adapter power). + Must be 0 if no AC Adaptor is plugged in. + +What: /sys/bus/platform/devices/INT3407:00/dptf_power/max_platform_power_mw +Date: Jul, 2016 +KernelVersion: v4.10 +Contact: linux-acpi@vger.kernel.org +Description: + (RO) Maximum platform power that can be supported by the battery + in milliwatts. + +What: /sys/bus/platform/devices/INT3407:00/dptf_power/platform_power_source +Date: Jul, 2016 +KernelVersion: v4.10 +Contact: linux-acpi@vger.kernel.org +Description: + (RO) Display the platform power source + 0x00 = DC + 0x01 = AC + 0x02 = USB + 0x03 = Wireless Charger + +What: /sys/bus/platform/devices/INT3407:00/dptf_power/battery_steady_power +Date: Jul, 2016 +KernelVersion: v4.10 +Contact: linux-acpi@vger.kernel.org +Description: + (RO) The maximum sustained power for battery in milliwatts. From 22029845ad81033115910cdef35170de6a10a1eb Mon Sep 17 00:00:00 2001 From: Aishwarya Pant Date: Sat, 10 Feb 2018 14:27:38 +0530 Subject: [PATCH 042/149] ACPI: dock: document sysfs interface Description has been collected from git commit history and reading through code. Signed-off-by: Aishwarya Pant Signed-off-by: Rafael J. Wysocki --- .../ABI/testing/sysfs-devices-platform-dock | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-devices-platform-dock diff --git a/Documentation/ABI/testing/sysfs-devices-platform-dock b/Documentation/ABI/testing/sysfs-devices-platform-dock new file mode 100644 index 000000000000..1d8c18f905c7 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-platform-dock @@ -0,0 +1,39 @@ +What: /sys/devices/platform/dock.N/docked +Date: Dec, 2006 +KernelVersion: 2.6.19 +Contact: linux-acpi@vger.kernel.org +Description: + (RO) Value 1 or 0 indicates whether the software believes the + laptop is docked in a docking station. + +What: /sys/devices/platform/dock.N/undock +Date: Dec, 2006 +KernelVersion: 2.6.19 +Contact: linux-acpi@vger.kernel.org +Description: + (WO) Writing to this file causes the software to initiate an + undock request to the firmware. + +What: /sys/devices/platform/dock.N/uid +Date: Feb, 2007 +KernelVersion: v2.6.21 +Contact: linux-acpi@vger.kernel.org +Description: + (RO) Displays the docking station the laptop is docked to. + +What: /sys/devices/platform/dock.N/flags +Date: May, 2007 +KernelVersion: v2.6.21 +Contact: linux-acpi@vger.kernel.org +Description: + (RO) Show dock station flags, useful for checking if undock + request has been made by the user (from the immediate_undock + option). + +What: /sys/devices/platform/dock.N/type +Date: Aug, 2008 +KernelVersion: v2.6.27 +Contact: linux-acpi@vger.kernel.org +Description: + (RO) Display the dock station type- dock_station, ata_bay or + battery_bay. From d7212cfb05ba802bea4dd6c90d61cfe6366ea224 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 12 Feb 2018 11:34:22 +0100 Subject: [PATCH 043/149] PM: cpuidle: Fix cpuidle_poll_state_init() prototype Commit f85942207516 (x86: PM: Make APM idle driver initialize polling state) made apm_init() call cpuidle_poll_state_init(), but that only is defined for CONFIG_CPU_IDLE set, so make the empty stub of it available for CONFIG_CPU_IDLE unset too to fix the resulting build issue. Fixes: f85942207516 (x86: PM: Make APM idle driver initialize polling state) Cc: 4.14+ # 4.14+ Signed-off-by: Rafael J. Wysocki --- include/linux/cpuidle.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index 8f7788d23b57..a6989e02d0a0 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -225,7 +225,7 @@ static inline void cpuidle_coupled_parallel_barrier(struct cpuidle_device *dev, } #endif -#ifdef CONFIG_ARCH_HAS_CPU_RELAX +#if defined(CONFIG_CPU_IDLE) && defined(CONFIG_ARCH_HAS_CPU_RELAX) void cpuidle_poll_state_init(struct cpuidle_driver *drv); #else static inline void cpuidle_poll_state_init(struct cpuidle_driver *drv) {} From 16e574d762ac5512eb922ac0ac5eed360b7db9d8 Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Sun, 11 Feb 2018 19:16:15 -0600 Subject: [PATCH 044/149] arm64: Add missing Falkor part number for branch predictor hardening References to CPU part number MIDR_QCOM_FALKOR were dropped from the mailing list patch due to mainline/arm64 branch dependency. So this patch adds the missing part number. Fixes: ec82b567a74f ("arm64: Implement branch predictor hardening for Falkor") Acked-by: Marc Zyngier Signed-off-by: Shanker Donthineni Signed-off-by: Catalin Marinas --- arch/arm64/kernel/cpu_errata.c | 9 +++++++++ arch/arm64/kvm/hyp/switch.c | 4 +++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index 07823595b7f0..52f15cd896e1 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -406,6 +406,15 @@ const struct arm64_cpu_capabilities arm64_errata[] = { .capability = ARM64_HARDEN_BP_POST_GUEST_EXIT, MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR_V1), }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR), + .enable = qcom_enable_link_stack_sanitization, + }, + { + .capability = ARM64_HARDEN_BP_POST_GUEST_EXIT, + MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR), + }, { .capability = ARM64_HARDEN_BRANCH_PREDICTOR, MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN), diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 116252a8d3a5..870f4b1587f9 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -407,8 +407,10 @@ again: u32 midr = read_cpuid_id(); /* Apply BTAC predictors mitigation to all Falkor chips */ - if ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1) + if (((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR) || + ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1)) { __qcom_hyp_sanitize_btac_predictors(); + } } fp_enabled = __fpsimd_enabled(); From 9d37c094dacda531ac3e529dd4dd139e3c0b7811 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Thu, 8 Feb 2018 19:39:20 +0000 Subject: [PATCH 045/149] ia64: fix build failure with CONFIG_SWIOTLB arch/ia64/kernel/pci-swiotlb.c is removed in commit 4fac8076df85 ("ia64: clean up swiotlb support") but pci-swiotlb.o is still present in Makefile, and so build fail when CONFIG_SWIOTLB is enabled. Fix the build failure by removing pci-swiotlb.o from makefile Fixes: 4fac8076df85 ("ia64: clean up swiotlb support") Signed-off-by: Corentin Labbe Signed-off-by: Christoph Hellwig --- arch/ia64/kernel/Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile index 0b4c65a1af25..498f3da3f225 100644 --- a/arch/ia64/kernel/Makefile +++ b/arch/ia64/kernel/Makefile @@ -41,7 +41,6 @@ ifneq ($(CONFIG_IA64_ESI),) obj-y += esi_stub.o # must be in kernel proper endif obj-$(CONFIG_INTEL_IOMMU) += pci-dma.o -obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o obj-$(CONFIG_BINFMT_ELF) += elfcore.o From f25e6f6b4eae7e25e92e91a570cae84bf83e751a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Feb 2018 09:21:07 +0100 Subject: [PATCH 046/149] dma-direct: mark as is_phys Various PCI_DMA_BUS_IS_PHYS implementations rely on this flag to make proper decisions for block and networking addressability. Signed-off-by: Christoph Hellwig --- lib/dma-direct.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/dma-direct.c b/lib/dma-direct.c index 40b1f92f2214..fdc733cf9e30 100644 --- a/lib/dma-direct.c +++ b/lib/dma-direct.c @@ -152,5 +152,6 @@ const struct dma_map_ops dma_direct_ops = { .map_sg = dma_direct_map_sg, .dma_supported = dma_direct_supported, .mapping_error = dma_direct_mapping_error, + .is_phys = 1, }; EXPORT_SYMBOL(dma_direct_ops); From 42ed64524d846b96afaa8b3f9ba045bcaf11ab0f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 2 Feb 2018 09:51:14 +0100 Subject: [PATCH 047/149] dma-direct: comment the dma_direct_free calling convention Signed-off-by: Christoph Hellwig --- lib/dma-direct.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/dma-direct.c b/lib/dma-direct.c index fdc733cf9e30..c9e8e21cb334 100644 --- a/lib/dma-direct.c +++ b/lib/dma-direct.c @@ -84,6 +84,10 @@ again: return page_address(page); } +/* + * NOTE: this function must never look at the dma_addr argument, because we want + * to be able to use it as a helper for iommu implementations as well. + */ void dma_direct_free(struct device *dev, size_t size, void *cpu_addr, dma_addr_t dma_addr, unsigned long attrs) { From ecc2dc55ce79945c2e0a04977706a99dc4848229 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 10 Feb 2018 09:43:49 +0100 Subject: [PATCH 048/149] dma-mapping: fix a comment typo Reported-by: Randy Dunlap Signed-off-by: Christoph Hellwig --- include/linux/dma-mapping.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 34fe8463d10e..eb9eab4ecd6d 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -578,7 +578,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr) /* * This is a hack for the legacy x86 forbid_dac and iommu_sac_force. Please - * don't use this is new code. + * don't use this in new code. */ #ifndef arch_dma_supported #define arch_dma_supported(dev, mask) (1) From 0a34e4668c508cbbc2d5ef2d9710b145e4c0b27d Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 11 Jan 2018 13:38:15 -0800 Subject: [PATCH 049/149] nvme: Don't use a stack buffer for keep-alive command In nvme_keep_alive() we pass a request with a pointer to an NVMe command on the stack into blk_execute_rq_nowait(). However, the block layer doesn't guarantee that the request is fully queued before blk_execute_rq_nowait() returns. If not, and the request is queued after nvme_keep_alive() returns, then we'll end up using stack memory that might have been overwritten to form the NVMe command we pass to hardware. Fix this by keeping a special command struct in the nvme_ctrl struct right next to the delayed work struct used for keep-alives. Signed-off-by: Roland Dreier Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 8 +++----- drivers/nvme/host/nvme.h | 1 + 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2fd8688cfa47..6d0490b477c9 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -796,13 +796,9 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status) static int nvme_keep_alive(struct nvme_ctrl *ctrl) { - struct nvme_command c; struct request *rq; - memset(&c, 0, sizeof(c)); - c.common.opcode = nvme_admin_keep_alive; - - rq = nvme_alloc_request(ctrl->admin_q, &c, BLK_MQ_REQ_RESERVED, + rq = nvme_alloc_request(ctrl->admin_q, &ctrl->ka_cmd, BLK_MQ_REQ_RESERVED, NVME_QID_ANY); if (IS_ERR(rq)) return PTR_ERR(rq); @@ -834,6 +830,8 @@ void nvme_start_keep_alive(struct nvme_ctrl *ctrl) return; INIT_DELAYED_WORK(&ctrl->ka_work, nvme_keep_alive_work); + memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd)); + ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive; schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); } EXPORT_SYMBOL_GPL(nvme_start_keep_alive); diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 27e31c00b306..0521e4707d1c 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -183,6 +183,7 @@ struct nvme_ctrl { struct work_struct scan_work; struct work_struct async_event_work; struct delayed_work ka_work; + struct nvme_command ka_cmd; struct work_struct fw_act_work; /* Power saving configuration */ From aef17ca1271948ee57cc39b2493d31110cc42625 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Wed, 7 Feb 2018 17:49:39 -0800 Subject: [PATCH 050/149] hwmon: (k10temp) Only apply temperature offset if result is positive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A user reports a really bad temperature on Ryzen 1950X. k10temp-pci-00cb Adapter: PCI adapter temp1: +4294948.3°C (high = +70.0°C) This will happen if the temperature reported by the chip is lower than the offset temperature. This has been seen in the field if "Sense MI Skew" and/or "Sense MI Offset" BIOS parameters were set to unexpected values. Let's report a temperature of 0 degrees C in that case. Fixes: 1b50b776355f ("hwmon: (k10temp) Add support for temperature offsets") Signed-off-by: Guenter Roeck --- drivers/hwmon/k10temp.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/k10temp.c b/drivers/hwmon/k10temp.c index 06b4e1c78bd8..4c6594a4661d 100644 --- a/drivers/hwmon/k10temp.c +++ b/drivers/hwmon/k10temp.c @@ -129,7 +129,10 @@ static ssize_t temp1_input_show(struct device *dev, data->read_tempreg(data->pdev, ®val); temp = (regval >> 21) * 125; - temp -= data->temp_offset; + if (temp > data->temp_offset) + temp -= data->temp_offset; + else + temp = 0; return sprintf(buf, "%u\n", temp); } From 75b0e73023ef7994348d619e9adadab0e96bb195 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 8 Feb 2018 10:24:02 +0000 Subject: [PATCH 051/149] drm/i915/perf: Fix compiler warning for string truncation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/gpu/drm/i915/i915_oa_cflgt3.c: In function ‘i915_perf_load_test_config_cflgt3’: drivers/gpu/drm/i915/i915_oa_cflgt3.c:87:2: error: ‘strncpy’ output truncated before terminating nul copying 36 bytes from a string of the same length [-Werror=stringop-truncation] v2: strlcpy Fixes: 4407eaa9b0cc ("drm/i915/perf: add support for Coffeelake GT3") Signed-off-by: Chris Wilson Cc: Lionel Landwerlin Cc: Matthew Auld Reviewed-by: Lionel Landwerlin Link: https://patchwork.freedesktop.org/patch/msgid/20180208102403.5587-1-chris@chris-wilson.co.uk (cherry picked from commit 43df81d324cdd7056ad0ce3df709aff8dce856b7) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_oa_cflgt3.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_oa_cflgt3.c b/drivers/gpu/drm/i915/i915_oa_cflgt3.c index 42ff06fe54a3..792facdb6702 100644 --- a/drivers/gpu/drm/i915/i915_oa_cflgt3.c +++ b/drivers/gpu/drm/i915/i915_oa_cflgt3.c @@ -84,9 +84,9 @@ show_test_oa_id(struct device *kdev, struct device_attribute *attr, char *buf) void i915_perf_load_test_config_cflgt3(struct drm_i915_private *dev_priv) { - strncpy(dev_priv->perf.oa.test_config.uuid, + strlcpy(dev_priv->perf.oa.test_config.uuid, "577e8e2c-3fa0-4875-8743-3538d585e3b0", - UUID_STRING_LEN); + sizeof(dev_priv->perf.oa.test_config.uuid)); dev_priv->perf.oa.test_config.id = 1; dev_priv->perf.oa.test_config.mux_regs = mux_config_test_oa; From 73b0fcd24ef1b8e20b7f6e6babcde540d96d0cb2 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 8 Feb 2018 10:24:03 +0000 Subject: [PATCH 052/149] drm/i915/perf: Fix compiler warning for string truncation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/gpu/drm/i915/i915_oa_cnl.c: In function ‘i915_perf_load_test_config_cnl’: drivers/gpu/drm/i915/i915_oa_cnl.c:99:2: error: ‘strncpy’ output truncated before terminating nul copying 36 bytes from a string of the same length [-Werror=stringop-truncation] v2: strlcpy Fixes: 95690a02fb5d ("drm/i915/perf: enable perf support on CNL") Signed-off-by: Chris Wilson Cc: Lionel Landwerlin Cc: Matthew Auld Reviewed-by: Lionel Landwerlin Link: https://patchwork.freedesktop.org/patch/msgid/20180208102403.5587-2-chris@chris-wilson.co.uk (cherry picked from commit 020580ff8edd50e64ae1bf47e560c61e5e2f29fc) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_oa_cnl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_oa_cnl.c b/drivers/gpu/drm/i915/i915_oa_cnl.c index ff0ac3627cc4..ba9140c87cc0 100644 --- a/drivers/gpu/drm/i915/i915_oa_cnl.c +++ b/drivers/gpu/drm/i915/i915_oa_cnl.c @@ -96,9 +96,9 @@ show_test_oa_id(struct device *kdev, struct device_attribute *attr, char *buf) void i915_perf_load_test_config_cnl(struct drm_i915_private *dev_priv) { - strncpy(dev_priv->perf.oa.test_config.uuid, + strlcpy(dev_priv->perf.oa.test_config.uuid, "db41edd4-d8e7-4730-ad11-b9a2d6833503", - UUID_STRING_LEN); + sizeof(dev_priv->perf.oa.test_config.uuid)); dev_priv->perf.oa.test_config.id = 1; dev_priv->perf.oa.test_config.mux_regs = mux_config_test_oa; From 33afe065b66f226ee5f90ab24ff55799c896e381 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Thu, 8 Feb 2018 08:51:51 +0000 Subject: [PATCH 053/149] drm/i915: Avoid truncation before clamping userspace's priority value Userspace provides a 64b value for the priority, we need to be careful to preserve the full range before validation to prevent truncation (and letting an illegal value pass). Reported-by: Antonio Argenziano Fixes: ac14fbd460d0 ("drm/i915/scheduler: Support user-defined priorities") Signed-off-by: Chris Wilson Cc: Antonio Argenziano Cc: Michal Winiarski Cc: Mika Kuoppala Cc: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20180208085151.11480-1-chris@chris-wilson.co.uk Reviewed-by: Joonas Lahtinen (cherry picked from commit 11a18f631959fd1ca10856c836a827683536770c) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_gem_context.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_gem_context.c b/drivers/gpu/drm/i915/i915_gem_context.c index 648e7536ff51..0c963fcf31ff 100644 --- a/drivers/gpu/drm/i915/i915_gem_context.c +++ b/drivers/gpu/drm/i915/i915_gem_context.c @@ -803,7 +803,7 @@ int i915_gem_context_setparam_ioctl(struct drm_device *dev, void *data, case I915_CONTEXT_PARAM_PRIORITY: { - int priority = args->value; + s64 priority = args->value; if (args->size) ret = -EINVAL; From 7292b9e6586534fb43e4316ad8b508bf3d1212f7 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 12 Feb 2018 09:39:28 +0000 Subject: [PATCH 054/149] drm/i915: Don't wake the device up to check if the engine is asleep If the entire device is powered off, we can safely assume that the engine is also asleep (and idle). Reported-by: Tvrtko Ursulin Fixes: a091d4ee931b ("drm/i915: Hold a wakeref for probing the ring registers") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Cc: Mika Kuoppala Reviewed-by: Mika Kuoppala Link: https://patchwork.freedesktop.org/patch/msgid/20180212093928.6005-1-chris@chris-wilson.co.uk (cherry picked from commit 74d00d28a15c8452f65de0a9477b52d95639cc63) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_engine_cs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index d790bdc227ff..acc661aa9c0c 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -1458,7 +1458,9 @@ static bool ring_is_idle(struct intel_engine_cs *engine) struct drm_i915_private *dev_priv = engine->i915; bool idle = true; - intel_runtime_pm_get(dev_priv); + /* If the whole device is asleep, the engine must be idle */ + if (!intel_runtime_pm_get_if_in_use(dev_priv)) + return true; /* First check that no commands are left in the ring */ if ((I915_READ_HEAD(engine) & HEAD_ADDR) != From 6fe0ce1eb04f99a1eb1eb6e7f775666966cf6c80 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Tue, 6 Feb 2018 09:55:48 +0800 Subject: [PATCH 055/149] sched/deadline: Make update_curr_dl() more accurate rq->clock_task may be updated between the two calls of rq_clock_task() in update_curr_dl(). Calling rq_clock_task() only once makes it more accurate and efficient, taking update_curr() as reference. Suggested-by: Peter Zijlstra Signed-off-by: Wen Yang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiang Biao Cc: Linus Torvalds Cc: Thomas Gleixner Cc: zhong.weidong@zte.com.cn Link: http://lkml.kernel.org/r/1517882148-44599-1-git-send-email-wen.yang99@zte.com.cn Signed-off-by: Ingo Molnar --- kernel/sched/deadline.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9bb0e0c412ec..9df09782025c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1153,6 +1153,7 @@ static void update_curr_dl(struct rq *rq) struct sched_dl_entity *dl_se = &curr->dl; u64 delta_exec, scaled_delta_exec; int cpu = cpu_of(rq); + u64 now; if (!dl_task(curr) || !on_dl_rq(dl_se)) return; @@ -1165,7 +1166,8 @@ static void update_curr_dl(struct rq *rq) * natural solution, but the full ramifications of this * approach need further study. */ - delta_exec = rq_clock_task(rq) - curr->se.exec_start; + now = rq_clock_task(rq); + delta_exec = now - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) { if (unlikely(dl_se->dl_yielded)) goto throttle; @@ -1178,7 +1180,7 @@ static void update_curr_dl(struct rq *rq) curr->se.sum_exec_runtime += delta_exec; account_group_exec_runtime(curr, delta_exec); - curr->se.exec_start = rq_clock_task(rq); + curr->se.exec_start = now; cgroup_account_cputime(curr, delta_exec); sched_rt_avg_update(rq, delta_exec); From a7711602c7b79950ea437178f601b52ab08ef659 Mon Sep 17 00:00:00 2001 From: Wen Yang Date: Tue, 6 Feb 2018 09:53:28 +0800 Subject: [PATCH 056/149] sched/rt: Make update_curr_rt() more accurate rq->clock_task may be updated between the two calls of rq_clock_task() in update_curr_rt(). Calling rq_clock_task() only once makes it more accurate and efficient, taking update_curr() as reference. Signed-off-by: Wen Yang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Jiang Biao Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: zhong.weidong@zte.com.cn Link: http://lkml.kernel.org/r/1517882008-44552-1-git-send-email-wen.yang99@zte.com.cn Signed-off-by: Ingo Molnar --- kernel/sched/rt.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 663b2355a3aa..aad49451584e 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -950,12 +950,13 @@ static void update_curr_rt(struct rq *rq) { struct task_struct *curr = rq->curr; struct sched_rt_entity *rt_se = &curr->rt; - u64 now = rq_clock_task(rq); u64 delta_exec; + u64 now; if (curr->sched_class != &rt_sched_class) return; + now = rq_clock_task(rq); delta_exec = now - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; From 269d599271fa604f09d5cb0093c5dd5d59964dd5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 Feb 2018 17:52:13 +0100 Subject: [PATCH 057/149] sched/core: Fix DEBUG_SPINLOCK annotation for rq->lock Mark noticed that he had sporadic "spinlock recursion" warnings from the DEBUG_SPINLOCK code. Now rq->lock is special in that the owner changes in the middle of a context switch. It so happens that we fix up the lock.owner too late, @prev can run (remotely) the moment prev->on_cpu is cleared, this then allows @prev to again try and acquire this rq->lock and trigger this warning. So we have to switch lock.owner before clearing prev->on_cpu. Do this by moving the DEBUG_SPINLOCK annotation from after switch_to() to before switch_to() and collect all lockdep annotations there into prepare_lock_switch() to mirror the existing finish_lock_switch(). Debugged-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Acked-by: Mark Rutland Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index bf724c1952ea..e7c535eee0a6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2601,19 +2601,31 @@ static inline void finish_task(struct task_struct *prev) #endif } -static inline void finish_lock_switch(struct rq *rq) +static inline void +prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) { + /* + * Since the runqueue lock will be released by the next + * task (which is an invalid locking op but in the case + * of the scheduler it's an obvious special-case), so we + * do an early lockdep release here: + */ + rq_unpin_lock(rq, rf); + spin_release(&rq->lock.dep_map, 1, _THIS_IP_); #ifdef CONFIG_DEBUG_SPINLOCK /* this is a valid case when another task releases the spinlock */ - rq->lock.owner = current; + rq->lock.owner = next; #endif +} + +static inline void finish_lock_switch(struct rq *rq) +{ /* * If we are tracking spinlock dependencies then we have to * fix up the runqueue lock - which gets 'carried over' from * prev into current: */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); - raw_spin_unlock_irq(&rq->lock); } @@ -2844,14 +2856,7 @@ context_switch(struct rq *rq, struct task_struct *prev, rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - /* - * Since the runqueue lock will be released by the next - * task (which is an invalid locking op but in the case - * of the scheduler it's an obvious special-case), so we - * do an early lockdep release here: - */ - rq_unpin_lock(rq, rf); - spin_release(&rq->lock.dep_map, 1, _THIS_IP_); + prepare_lock_switch(rq, next, rf); /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); From 627f4a2bdf113ab88abc65cb505c89cbf615eae0 Mon Sep 17 00:00:00 2001 From: Jaedon Shin Date: Tue, 6 Feb 2018 12:13:21 +0900 Subject: [PATCH 058/149] MIPS: BMIPS: Fix section mismatch warning Remove the __init annotation from bmips_cpu_setup() to avoid the following warning. WARNING: vmlinux.o(.text+0x35c950): Section mismatch in reference from the function brcmstb_pm_s3() to the function .init.text:bmips_cpu_setup() The function brcmstb_pm_s3() references the function __init bmips_cpu_setup(). This is often because brcmstb_pm_s3 lacks a __init annotation or the annotation of bmips_cpu_setup is wrong. Signed-off-by: Jaedon Shin Cc: Ralf Baechle Cc: Florian Fainelli Cc: Kevin Cernekee Cc: linux-mips@linux-mips.org Reviewed-by: James Hogan Reviewed-by: Florian Fainelli Patchwork: https://patchwork.linux-mips.org/patch/18589/ Signed-off-by: James Hogan --- arch/mips/kernel/smp-bmips.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kernel/smp-bmips.c b/arch/mips/kernel/smp-bmips.c index 87dcac2447c8..9d41732a9146 100644 --- a/arch/mips/kernel/smp-bmips.c +++ b/arch/mips/kernel/smp-bmips.c @@ -572,7 +572,7 @@ asmlinkage void __weak plat_wired_tlb_setup(void) */ } -void __init bmips_cpu_setup(void) +void bmips_cpu_setup(void) { void __iomem __maybe_unused *cbr = BMIPS_GET_CBR(); u32 __maybe_unused cfg; From 43d1b29b27c76e7454cd6c85bec4d0e9cbb039f3 Mon Sep 17 00:00:00 2001 From: Leo Yan Date: Thu, 8 Feb 2018 21:48:22 +0800 Subject: [PATCH 059/149] sched/cpufreq: Remove unused SUGOV_KTHREAD_PRIORITY macro Since schedutil kernel thread directly set priority to 0, the macro SUGOV_KTHREAD_PRIORITY is not used. So remove it. Signed-off-by: Leo Yan Acked-by: Viresh Kumar Acked-by: Daniel Lezcano Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rafael J . Wysocki Cc: Thomas Gleixner Cc: Vikram Mulukutla Cc: Vincent Guittot Link: http://lkml.kernel.org/r/1518097702-9665-1-git-send-email-leo.yan@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index dd062a1c8cf0..7936f548e071 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -19,8 +19,6 @@ #include "sched.h" -#define SUGOV_KTHREAD_PRIORITY 50 - struct sugov_tunables { struct gov_attr_set attr_set; unsigned int rate_limit_us; From 67a3ba25aa955198196f40b76b329b3ab9ad415a Mon Sep 17 00:00:00 2001 From: Marcin Nowakowski Date: Thu, 1 Feb 2018 12:37:21 +0100 Subject: [PATCH 060/149] MIPS: Fix incorrect mem=X@Y handling Commit 73fbc1eba7ff ("MIPS: fix mem=X@Y commandline processing") added a fix to ensure that the memory range between PHYS_OFFSET and low memory address specified by mem= cmdline argument is not later processed by free_all_bootmem. This change was incorrect for systems where the commandline specifies more than 1 mem argument, as it will cause all memory between PHYS_OFFSET and each of the memory offsets to be marked as reserved, which results in parts of the RAM marked as reserved (Creator CI20's u-boot has a default commandline argument 'mem=256M@0x0 mem=768M@0x30000000'). Change the behaviour to ensure that only the range between PHYS_OFFSET and the lowest start address of the memories is marked as protected. This change also ensures that the range is marked protected even if it's only defined through the devicetree and not only via commandline arguments. Reported-by: Mathieu Malaterre Signed-off-by: Marcin Nowakowski Fixes: 73fbc1eba7ff ("MIPS: fix mem=X@Y commandline processing") Cc: Ralf Baechle Cc: linux-mips@linux-mips.org Cc: # v4.11+ Tested-by: Mathieu Malaterre Patchwork: https://patchwork.linux-mips.org/patch/18562/ Signed-off-by: James Hogan --- arch/mips/kernel/setup.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 85bc601e9a0d..5f8b0a9e30b3 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -375,6 +375,7 @@ static void __init bootmem_init(void) unsigned long reserved_end; unsigned long mapstart = ~0UL; unsigned long bootmap_size; + phys_addr_t ramstart = (phys_addr_t)ULLONG_MAX; bool bootmap_valid = false; int i; @@ -395,7 +396,8 @@ static void __init bootmem_init(void) max_low_pfn = 0; /* - * Find the highest page frame number we have available. + * Find the highest page frame number we have available + * and the lowest used RAM address */ for (i = 0; i < boot_mem_map.nr_map; i++) { unsigned long start, end; @@ -407,6 +409,8 @@ static void __init bootmem_init(void) end = PFN_DOWN(boot_mem_map.map[i].addr + boot_mem_map.map[i].size); + ramstart = min(ramstart, boot_mem_map.map[i].addr); + #ifndef CONFIG_HIGHMEM /* * Skip highmem here so we get an accurate max_low_pfn if low @@ -436,6 +440,13 @@ static void __init bootmem_init(void) mapstart = max(reserved_end, start); } + /* + * Reserve any memory between the start of RAM and PHYS_OFFSET + */ + if (ramstart > PHYS_OFFSET) + add_memory_region(PHYS_OFFSET, ramstart - PHYS_OFFSET, + BOOT_MEM_RESERVED); + if (min_low_pfn >= max_low_pfn) panic("Incorrect memory mapping !!!"); if (min_low_pfn > ARCH_PFN_OFFSET) { @@ -664,9 +675,6 @@ static int __init early_parse_mem(char *p) add_memory_region(start, size, BOOT_MEM_RAM); - if (start && start > PHYS_OFFSET) - add_memory_region(PHYS_OFFSET, start - PHYS_OFFSET, - BOOT_MEM_RESERVED); return 0; } early_param("mem", early_parse_mem); From 95bcade33a8af38755c9b0636e36a36ad3789fe6 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 13 Feb 2018 13:22:56 +0000 Subject: [PATCH 061/149] locking/qspinlock: Ensure node is initialised before updating prev->next When a locker ends up queuing on the qspinlock locking slowpath, we initialise the relevant mcs node and publish it indirectly by updating the tail portion of the lock word using xchg_tail. If we find that there was a pre-existing locker in the queue, we subsequently update their ->next field to point at our node so that we are notified when it's our turn to take the lock. This can be roughly illustrated as follows: /* Initialise the fields in node and encode a pointer to node in tail */ tail = initialise_node(node); /* * Exchange tail into the lockword using an atomic read-modify-write * operation with release semantics */ old = xchg_tail(lock, tail); /* If there was a pre-existing waiter ... */ if (old & _Q_TAIL_MASK) { prev = decode_tail(old); smp_read_barrier_depends(); /* ... then update their ->next field to point to node. WRITE_ONCE(prev->next, node); } The conditional update of prev->next therefore relies on the address dependency from the result of xchg_tail ensuring order against the prior initialisation of node. However, since the release semantics of the xchg_tail operation apply only to the write portion of the RmW, then this ordering is not guaranteed and it is possible for the CPU to return old before the writes to node have been published, consequently allowing us to point prev->next to an uninitialised node. This patch fixes the problem by making the update of prev->next a RELEASE operation, which also removes the reliance on dependency ordering. Signed-off-by: Will Deacon Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1518528177-19169-2-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 38ece035039e..348c8cec1042 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -408,14 +408,15 @@ queue: */ if (old & _Q_TAIL_MASK) { prev = decode_tail(old); - /* - * The above xchg_tail() is also a load of @lock which - * generates, through decode_tail(), a pointer. The address - * dependency matches the RELEASE of xchg_tail() such that - * the subsequent access to @prev happens after. - */ - WRITE_ONCE(prev->next, node); + /* + * We must ensure that the stores to @node are observed before + * the write to prev->next. The address dependency from + * xchg_tail is not sufficient to ensure this because the read + * component of xchg_tail is unordered with respect to the + * initialisation of @node. + */ + smp_store_release(&prev->next, node); pv_wait_node(node, prev); arch_mcs_spin_lock_contended(&node->locked); From 11dc13224c975efcec96647a4768a6f1bb7a19a8 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 13 Feb 2018 13:22:57 +0000 Subject: [PATCH 062/149] locking/qspinlock: Ensure node->count is updated before initialising node When queuing on the qspinlock, the count field for the current CPU's head node is incremented. This needn't be atomic because locking in e.g. IRQ context is balanced and so an IRQ will return with node->count as it found it. However, the compiler could in theory reorder the initialisation of node[idx] before the increment of the head node->count, causing an IRQ to overwrite the initialised node and potentially corrupt the lock state. Avoid the potential for this harmful compiler reordering by placing a barrier() between the increment of the head node->count and the subsequent node initialisation. Signed-off-by: Will Deacon Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1518528177-19169-3-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 348c8cec1042..d880296245c5 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -379,6 +379,14 @@ queue: tail = encode_tail(smp_processor_id(), idx); node += idx; + + /* + * Ensure that we increment the head node->count before initialising + * the actual node. If the compiler is kind enough to reorder these + * stores, then an IRQ could overwrite our assignments. + */ + barrier(); + node->locked = 0; node->next = NULL; pv_init_node(node); From 61e02392d3c7ecac1f91c0a90a8043d67e081846 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 13 Feb 2018 13:30:19 +0000 Subject: [PATCH 063/149] locking/atomic/bitops: Document and clarify ordering semantics for failed test_and_{}_bit() A test_and_{}_bit() operation fails if the value of the bit is such that the modification does not take place. For example, if test_and_set_bit() returns 1. In these cases, follow the behaviour of cmpxchg and allow the operation to be unordered. This also applies to test_and_set_bit_lock() if the lock is found to be be taken already. Signed-off-by: Will Deacon Acked-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1518528619-20049-1-git-send-email-will.deacon@arm.com Signed-off-by: Ingo Molnar --- Documentation/atomic_bitops.txt | 7 ++++++- include/asm-generic/bitops/lock.h | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/Documentation/atomic_bitops.txt b/Documentation/atomic_bitops.txt index 5550bfdcce5f..be70b32c95d9 100644 --- a/Documentation/atomic_bitops.txt +++ b/Documentation/atomic_bitops.txt @@ -58,7 +58,12 @@ Like with atomic_t, the rule of thumb is: - RMW operations that have a return value are fully ordered. -Except for test_and_set_bit_lock() which has ACQUIRE semantics and + - RMW operations that are conditional are unordered on FAILURE, + otherwise the above rules apply. In the case of test_and_{}_bit() operations, + if the bit in memory is unchanged by the operation then it is deemed to have + failed. + +Except for a successful test_and_set_bit_lock() which has ACQUIRE semantics and clear_bit_unlock() which has RELEASE semantics. Since a platform only has a single means of achieving atomic operations diff --git a/include/asm-generic/bitops/lock.h b/include/asm-generic/bitops/lock.h index bc397573c43a..67ab280ad134 100644 --- a/include/asm-generic/bitops/lock.h +++ b/include/asm-generic/bitops/lock.h @@ -7,7 +7,8 @@ * @nr: Bit to set * @addr: Address to count from * - * This operation is atomic and provides acquire barrier semantics. + * This operation is atomic and provides acquire barrier semantics if + * the returned value is 0. * It can be used to implement bit locks. */ #define test_and_set_bit_lock(nr, addr) test_and_set_bit(nr, addr) From 2dd6fd2e999774041397f2a7da2e1d30b3a27c3a Mon Sep 17 00:00:00 2001 From: Tycho Andersen Date: Thu, 1 Feb 2018 12:41:19 +0100 Subject: [PATCH 064/149] locking/semaphore: Update the file path in documentation While reading this header I noticed that the locking stuff has moved to kernel/locking/*, so update the path in semaphore.h to point to that. Signed-off-by: Tycho Andersen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20180201114119.1090-1-tycho@tycho.ws Signed-off-by: Ingo Molnar --- include/linux/semaphore.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/semaphore.h b/include/linux/semaphore.h index dc368b8ce215..11c86fbfeb98 100644 --- a/include/linux/semaphore.h +++ b/include/linux/semaphore.h @@ -4,7 +4,7 @@ * * Distributed under the terms of the GNU GPL, version 2 * - * Please see kernel/semaphore.c for documentation of these functions + * Please see kernel/locking/semaphore.c for documentation of these functions */ #ifndef __LINUX_SEMAPHORE_H #define __LINUX_SEMAPHORE_H From 67b4110f8c8d16e588d7730db8e8b01b32c1bd8b Mon Sep 17 00:00:00 2001 From: Nitesh Shetty Date: Tue, 13 Feb 2018 21:18:12 +0530 Subject: [PATCH 065/149] blk: optimization for classic polling This removes the dependency on interrupts to wake up task. Set task state as TASK_RUNNING, if need_resched() returns true, while polling for IO completion. Earlier, polling task used to sleep, relying on interrupt to wake it up. This made some IO take very long when interrupt-coalescing is enabled in NVMe. Reference: http://lists.infradead.org/pipermail/linux-nvme/2018-February/015435.html Changes since v2->v3: -using __set_current_state() instead of set_current_state() Changes since v1->v2: -setting task state once in blk_poll, instead of multiple callers. Signed-off-by: Nitesh Shetty Signed-off-by: Jens Axboe --- block/blk-mq.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/blk-mq.c b/block/blk-mq.c index df93102e2149..357492712b0e 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3164,6 +3164,7 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) cpu_relax(); } + __set_current_state(TASK_RUNNING); return false; } From 7bcfab202ca71bece02b283cdd104301c07eece4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 13 Feb 2018 07:44:34 -0800 Subject: [PATCH 066/149] powerpc/macio: set a proper dma_coherent_mask We have expected busses to set up a coherent mask to properly use the common dma mapping code for a long time, and now that I've added a warning macio turned out to not set one up yet. This sets it to the same value as the dma_mask, which seems to be what the drivers expect. Reported-by: Mathieu Malaterre Tested-by: Mathieu Malaterre Reported-by: Meelis Roos Tested-by: Meelis Roos Signed-off-by: Christoph Hellwig --- drivers/macintosh/macio_asic.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/macintosh/macio_asic.c b/drivers/macintosh/macio_asic.c index 62f541f968f6..07074820a167 100644 --- a/drivers/macintosh/macio_asic.c +++ b/drivers/macintosh/macio_asic.c @@ -375,6 +375,7 @@ static struct macio_dev * macio_add_one_device(struct macio_chip *chip, dev->ofdev.dev.of_node = np; dev->ofdev.archdata.dma_mask = 0xffffffffUL; dev->ofdev.dev.dma_mask = &dev->ofdev.archdata.dma_mask; + dev->ofdev.dev.coherent_dma_mask = dev->ofdev.archdata.dma_mask; dev->ofdev.dev.parent = parent; dev->ofdev.dev.bus = &macio_bus_type; dev->ofdev.dev.release = macio_release_dev; From 3fd176b754e992e1cdf1693ea8184626d1ed7671 Mon Sep 17 00:00:00 2001 From: Jianchao Wang Date: Mon, 12 Feb 2018 20:54:45 +0800 Subject: [PATCH 067/149] nvme: fix the deadlock in nvme_update_formats nvme_update_formats will invoke nvme_ns_remove under namespaces_mutext. The will cause deadlock because nvme_ns_remove will also require the namespaces_mutext. Fix it by getting the ns entries which should be removed under namespaces_mutext and invoke nvme_ns_remove out of namespaces_mutext. Signed-off-by: Jianchao Wang Reviewed-by: Christoph Hellwig Reviewed-by: Keith Busch Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 6d0490b477c9..52b3626fb64e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -1117,14 +1117,19 @@ static u32 nvme_passthru_start(struct nvme_ctrl *ctrl, struct nvme_ns *ns, static void nvme_update_formats(struct nvme_ctrl *ctrl) { - struct nvme_ns *ns; + struct nvme_ns *ns, *next; + LIST_HEAD(rm_list); mutex_lock(&ctrl->namespaces_mutex); list_for_each_entry(ns, &ctrl->namespaces, list) { - if (ns->disk && nvme_revalidate_disk(ns->disk)) - nvme_ns_remove(ns); + if (ns->disk && nvme_revalidate_disk(ns->disk)) { + list_move_tail(&ns->list, &rm_list); + } } mutex_unlock(&ctrl->namespaces_mutex); + + list_for_each_entry_safe(ns, next, &rm_list, list) + nvme_ns_remove(ns); } static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects) From 815c6704bf9f1c59f3a6be380a4032b9c57b12f1 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 13 Feb 2018 05:44:44 -0700 Subject: [PATCH 068/149] nvme-pci: Remap CMB SQ entries on every controller reset The controller memory buffer is remapped into a kernel address on each reset, but the driver was setting the submission queue base address only on the very first queue creation. The remapped address is likely to change after a reset, so accessing the old address will hit a kernel bug. This patch fixes that by setting the queue's CMB base address each time the queue is created. Fixes: f63572dff1421 ("nvme: unmap CMB and remove sysfs file in reset path") Reported-by: Christian Black Cc: Jon Derrick Cc: # 4.9+ Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index ab9c19525fa8..b427157af74e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1364,18 +1364,14 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, int qid, int depth) { - if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { - unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), - dev->ctrl.page_size); - nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; - nvmeq->sq_cmds_io = dev->cmb + offset; - } else { - nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), - &nvmeq->sq_dma_addr, GFP_KERNEL); - if (!nvmeq->sq_cmds) - return -ENOMEM; - } + /* CMB SQEs will be mapped before creation */ + if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) + return 0; + nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), + &nvmeq->sq_dma_addr, GFP_KERNEL); + if (!nvmeq->sq_cmds) + return -ENOMEM; return 0; } @@ -1449,6 +1445,13 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) struct nvme_dev *dev = nvmeq->dev; int result; + if (dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { + unsigned offset = (qid - 1) * roundup(SQ_SIZE(nvmeq->q_depth), + dev->ctrl.page_size); + nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; + nvmeq->sq_cmds_io = dev->cmb + offset; + } + nvmeq->cq_vector = qid - 1; result = adapter_alloc_cq(dev, qid, nvmeq); if (result < 0) From 4244140d7b8f406b7edfd01c050dea783aa1efc5 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 8 Feb 2018 08:55:34 -0700 Subject: [PATCH 069/149] nvme-pci: Fix timeouts in connecting state We need to halt the controller immediately if we haven't completed initialization as indicated by the new "connecting" state. Fixes: ad70062cdb ("nvme-pci: introduce RECONNECTING state to mark initializing procedure") Signed-off-by: Keith Busch Reviewed-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b427157af74e..73036d2fbbd5 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1215,13 +1215,17 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) * cancellation error. All outstanding requests are completed on * shutdown, so we return BLK_EH_HANDLED. */ - if (dev->ctrl.state == NVME_CTRL_RESETTING) { + switch (dev->ctrl.state) { + case NVME_CTRL_CONNECTING: + case NVME_CTRL_RESETTING: dev_warn(dev->ctrl.device, "I/O %d QID %d timeout, disable controller\n", req->tag, nvmeq->qid); nvme_dev_disable(dev, false); nvme_req(req)->flags |= NVME_REQ_CANCELLED; return BLK_EH_HANDLED; + default: + break; } /* From 117172c8f9d40ba1de8cb35c6e614422faa03330 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 13 Feb 2018 09:01:54 +0000 Subject: [PATCH 070/149] drm/i915/breadcrumbs: Ignore unsubmitted signalers When a request is preempted, it is unsubmitted from the HW queue and removed from the active list of breadcrumbs. In the process, this however triggers the signaler and it may see the clear rbtree with the old, and still valid, seqno, or it may match the cleared seqno with the now zero rq->global_seqno. This confuses the signaler into action and signaling the fence. Fixes: d6a2289d9d6b ("drm/i915: Remove the preempted request from the execution queue") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Cc: Joonas Lahtinen Cc: # v4.12+ Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20180206094633.30181-1-chris@chris-wilson.co.uk (cherry picked from commit fd10e2ce9905030d922e179a8047a4d50daffd8e) Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20180213090154.17373-1-chris@chris-wilson.co.uk --- drivers/gpu/drm/i915/intel_breadcrumbs.c | 29 ++++++++---------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_breadcrumbs.c b/drivers/gpu/drm/i915/intel_breadcrumbs.c index bd40fea16b4f..f54ddda9fdad 100644 --- a/drivers/gpu/drm/i915/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/intel_breadcrumbs.c @@ -594,29 +594,16 @@ void intel_engine_remove_wait(struct intel_engine_cs *engine, spin_unlock_irq(&b->rb_lock); } -static bool signal_valid(const struct drm_i915_gem_request *request) -{ - return intel_wait_check_request(&request->signaling.wait, request); -} - static bool signal_complete(const struct drm_i915_gem_request *request) { if (!request) return false; - /* If another process served as the bottom-half it may have already - * signalled that this wait is already completed. - */ - if (intel_wait_complete(&request->signaling.wait)) - return signal_valid(request); - - /* Carefully check if the request is complete, giving time for the + /* + * Carefully check if the request is complete, giving time for the * seqno to be visible or if the GPU hung. */ - if (__i915_request_irq_complete(request)) - return true; - - return false; + return __i915_request_irq_complete(request); } static struct drm_i915_gem_request *to_signaler(struct rb_node *rb) @@ -659,9 +646,13 @@ static int intel_breadcrumbs_signaler(void *arg) request = i915_gem_request_get_rcu(request); rcu_read_unlock(); if (signal_complete(request)) { - local_bh_disable(); - dma_fence_signal(&request->fence); - local_bh_enable(); /* kick start the tasklets */ + if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, + &request->fence.flags)) { + local_bh_disable(); + dma_fence_signal(&request->fence); + GEM_BUG_ON(!i915_gem_request_completed(request)); + local_bh_enable(); /* kick start the tasklets */ + } spin_lock_irq(&b->rb_lock); From edb76b01ac1629bfe17158bea56fcc16bfb57854 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 13 Feb 2018 09:57:44 +0000 Subject: [PATCH 071/149] drm/i915: Lock out execlist tasklet while peeking inside for busy-stats In order to prevent a race condition where we may end up overaccounting the active state and leaving the busy-stats believing the GPU is 100% busy, lock out the tasklet while we reconstruct the busy state. There is no direct spinlock guard for the execlists->port[], so we need to utilise tasklet_disable() as a synchronous barrier to prevent it, the only writer to execlists->port[], from running at the same time as the enable. Fixes: 4900727d35bb ("drm/i915/pmu: Reconstruct active state on starting busy-stats") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20180115092041.13509-1-chris@chris-wilson.co.uk Reviewed-by: Tvrtko Ursulin (cherry picked from commit 99e48bf98dd036090b480a12c39e8b971731247e) Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20180213095747.2424-1-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/intel_engine_cs.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/intel_engine_cs.c b/drivers/gpu/drm/i915/intel_engine_cs.c index acc661aa9c0c..fa960cfd2764 100644 --- a/drivers/gpu/drm/i915/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/intel_engine_cs.c @@ -1945,16 +1945,22 @@ intel_engine_lookup_user(struct drm_i915_private *i915, u8 class, u8 instance) */ int intel_enable_engine_stats(struct intel_engine_cs *engine) { + struct intel_engine_execlists *execlists = &engine->execlists; unsigned long flags; + int err = 0; if (!intel_engine_supports_stats(engine)) return -ENODEV; + tasklet_disable(&execlists->tasklet); spin_lock_irqsave(&engine->stats.lock, flags); - if (engine->stats.enabled == ~0) - goto busy; + + if (unlikely(engine->stats.enabled == ~0)) { + err = -EBUSY; + goto unlock; + } + if (engine->stats.enabled++ == 0) { - struct intel_engine_execlists *execlists = &engine->execlists; const struct execlist_port *port = execlists->port; unsigned int num_ports = execlists_num_ports(execlists); @@ -1969,14 +1975,12 @@ int intel_enable_engine_stats(struct intel_engine_cs *engine) if (engine->stats.active) engine->stats.start = engine->stats.enabled_at; } + +unlock: spin_unlock_irqrestore(&engine->stats.lock, flags); + tasklet_enable(&execlists->tasklet); - return 0; - -busy: - spin_unlock_irqrestore(&engine->stats.lock, flags); - - return -EBUSY; + return err; } static ktime_t __intel_engine_get_busy_time(struct intel_engine_cs *engine) From d3f84c8b097001e3f31f584b793493cb0033a7ae Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 13 Feb 2018 09:57:45 +0000 Subject: [PATCH 072/149] drm/i915/pmu: Fix PMU enable vs execlists tasklet race Commit 99e48bf98dd0 ("drm/i915: Lock out execlist tasklet while peeking inside for busy-stats") added a tasklet_disable call in busy stats enabling, but we failed to understand that the PMU enable callback runs as an hard IRQ (IPI). Consequence of this is that the PMU enable callback can interrupt the execlists tasklet, and will then deadlock when it calls intel_engine_stats_enable->tasklet_disable. To fix this, I realized it is possible to move the engine stats enablement and disablement to PMU event init and destroy hooks. This allows for much simpler implementation since those hooks run in normal context (can sleep). v2: Extract engine_event_destroy. (Chris Wilson) Signed-off-by: Tvrtko Ursulin Fixes: 99e48bf98dd0 ("drm/i915: Lock out execlist tasklet while peeking inside for busy-stats") Testcase: igt/perf_pmu/enable-race-* Cc: Chris Wilson Cc: Tvrtko Ursulin Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: intel-gfx@lists.freedesktop.org Reviewed-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20180205093448.13877-1-tvrtko.ursulin@linux.intel.com (cherry picked from commit b2f78cda260bc6a1a2d382b1d85a29e69b5b3724) Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20180213095747.2424-2-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/i915_pmu.c | 127 ++++++++++-------------- drivers/gpu/drm/i915/intel_ringbuffer.h | 14 --- 2 files changed, 53 insertions(+), 88 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index 55a8a1e29424..337eaa6ede52 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -285,26 +285,41 @@ static u64 count_interrupts(struct drm_i915_private *i915) return sum; } -static void i915_pmu_event_destroy(struct perf_event *event) -{ - WARN_ON(event->parent); -} - -static int engine_event_init(struct perf_event *event) +static void engine_event_destroy(struct perf_event *event) { struct drm_i915_private *i915 = container_of(event->pmu, typeof(*i915), pmu.base); + struct intel_engine_cs *engine; - if (!intel_engine_lookup_user(i915, engine_event_class(event), - engine_event_instance(event))) - return -ENODEV; + engine = intel_engine_lookup_user(i915, + engine_event_class(event), + engine_event_instance(event)); + if (WARN_ON_ONCE(!engine)) + return; - switch (engine_event_sample(event)) { + if (engine_event_sample(event) == I915_SAMPLE_BUSY && + intel_engine_supports_stats(engine)) + intel_disable_engine_stats(engine); +} + +static void i915_pmu_event_destroy(struct perf_event *event) +{ + WARN_ON(event->parent); + + if (is_engine_event(event)) + engine_event_destroy(event); +} + +static int +engine_event_status(struct intel_engine_cs *engine, + enum drm_i915_pmu_engine_sample sample) +{ + switch (sample) { case I915_SAMPLE_BUSY: case I915_SAMPLE_WAIT: break; case I915_SAMPLE_SEMA: - if (INTEL_GEN(i915) < 6) + if (INTEL_GEN(engine->i915) < 6) return -ENODEV; break; default: @@ -314,6 +329,30 @@ static int engine_event_init(struct perf_event *event) return 0; } +static int engine_event_init(struct perf_event *event) +{ + struct drm_i915_private *i915 = + container_of(event->pmu, typeof(*i915), pmu.base); + struct intel_engine_cs *engine; + u8 sample; + int ret; + + engine = intel_engine_lookup_user(i915, engine_event_class(event), + engine_event_instance(event)); + if (!engine) + return -ENODEV; + + sample = engine_event_sample(event); + ret = engine_event_status(engine, sample); + if (ret) + return ret; + + if (sample == I915_SAMPLE_BUSY && intel_engine_supports_stats(engine)) + ret = intel_enable_engine_stats(engine); + + return ret; +} + static int i915_pmu_event_init(struct perf_event *event) { struct drm_i915_private *i915 = @@ -387,7 +426,7 @@ static u64 __i915_pmu_event_read(struct perf_event *event) if (WARN_ON_ONCE(!engine)) { /* Do nothing */ } else if (sample == I915_SAMPLE_BUSY && - engine->pmu.busy_stats) { + intel_engine_supports_stats(engine)) { val = ktime_to_ns(intel_engine_get_busy_time(engine)); } else { val = engine->pmu.sample[sample].cur; @@ -442,12 +481,6 @@ again: local64_add(new - prev, &event->count); } -static bool engine_needs_busy_stats(struct intel_engine_cs *engine) -{ - return intel_engine_supports_stats(engine) && - (engine->pmu.enable & BIT(I915_SAMPLE_BUSY)); -} - static void i915_pmu_enable(struct perf_event *event) { struct drm_i915_private *i915 = @@ -487,21 +520,7 @@ static void i915_pmu_enable(struct perf_event *event) GEM_BUG_ON(sample >= I915_PMU_SAMPLE_BITS); GEM_BUG_ON(engine->pmu.enable_count[sample] == ~0); - if (engine->pmu.enable_count[sample]++ == 0) { - /* - * Enable engine busy stats tracking if needed or - * alternatively cancel the scheduled disable. - * - * If the delayed disable was pending, cancel it and - * in this case do not enable since it already is. - */ - if (engine_needs_busy_stats(engine) && - !engine->pmu.busy_stats) { - engine->pmu.busy_stats = true; - if (!cancel_delayed_work(&engine->pmu.disable_busy_stats)) - intel_enable_engine_stats(engine); - } - } + engine->pmu.enable_count[sample]++; } /* @@ -514,14 +533,6 @@ static void i915_pmu_enable(struct perf_event *event) spin_unlock_irqrestore(&i915->pmu.lock, flags); } -static void __disable_busy_stats(struct work_struct *work) -{ - struct intel_engine_cs *engine = - container_of(work, typeof(*engine), pmu.disable_busy_stats.work); - - intel_disable_engine_stats(engine); -} - static void i915_pmu_disable(struct perf_event *event) { struct drm_i915_private *i915 = @@ -545,26 +556,8 @@ static void i915_pmu_disable(struct perf_event *event) * Decrement the reference count and clear the enabled * bitmask when the last listener on an event goes away. */ - if (--engine->pmu.enable_count[sample] == 0) { + if (--engine->pmu.enable_count[sample] == 0) engine->pmu.enable &= ~BIT(sample); - if (!engine_needs_busy_stats(engine) && - engine->pmu.busy_stats) { - engine->pmu.busy_stats = false; - /* - * We request a delayed disable to handle the - * rapid on/off cycles on events, which can - * happen when tools like perf stat start, in a - * nicer way. - * - * In addition, this also helps with busy stats - * accuracy with background CPU offline/online - * migration events. - */ - queue_delayed_work(system_wq, - &engine->pmu.disable_busy_stats, - round_jiffies_up_relative(HZ)); - } - } } GEM_BUG_ON(bit >= I915_PMU_MASK_BITS); @@ -797,8 +790,6 @@ static void i915_pmu_unregister_cpuhp_state(struct drm_i915_private *i915) void i915_pmu_register(struct drm_i915_private *i915) { - struct intel_engine_cs *engine; - enum intel_engine_id id; int ret; if (INTEL_GEN(i915) <= 2) { @@ -820,10 +811,6 @@ void i915_pmu_register(struct drm_i915_private *i915) hrtimer_init(&i915->pmu.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); i915->pmu.timer.function = i915_sample; - for_each_engine(engine, i915, id) - INIT_DELAYED_WORK(&engine->pmu.disable_busy_stats, - __disable_busy_stats); - ret = perf_pmu_register(&i915->pmu.base, "i915", -1); if (ret) goto err; @@ -843,9 +830,6 @@ err: void i915_pmu_unregister(struct drm_i915_private *i915) { - struct intel_engine_cs *engine; - enum intel_engine_id id; - if (!i915->pmu.base.event_init) return; @@ -853,11 +837,6 @@ void i915_pmu_unregister(struct drm_i915_private *i915) hrtimer_cancel(&i915->pmu.timer); - for_each_engine(engine, i915, id) { - GEM_BUG_ON(engine->pmu.busy_stats); - flush_delayed_work(&engine->pmu.disable_busy_stats); - } - i915_pmu_unregister_cpuhp_state(i915); perf_pmu_unregister(&i915->pmu.base); diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h index c5ff203e42d6..a0e7a6c2a57c 100644 --- a/drivers/gpu/drm/i915/intel_ringbuffer.h +++ b/drivers/gpu/drm/i915/intel_ringbuffer.h @@ -366,20 +366,6 @@ struct intel_engine_cs { */ #define I915_ENGINE_SAMPLE_MAX (I915_SAMPLE_SEMA + 1) struct i915_pmu_sample sample[I915_ENGINE_SAMPLE_MAX]; - /** - * @busy_stats: Has enablement of engine stats tracking been - * requested. - */ - bool busy_stats; - /** - * @disable_busy_stats: Work item for busy stats disabling. - * - * Same as with @enable_busy_stats action, with the difference - * that we delay it in case there are rapid enable-disable - * actions, which can happen during tool startup (like perf - * stat). - */ - struct delayed_work disable_busy_stats; } pmu; /* From 4c83f0a788ccf58864f781585d8ae7c7e6a7e07d Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Tue, 13 Feb 2018 09:57:46 +0000 Subject: [PATCH 073/149] drm/i915/pmu: Fix sleep under atomic in RC6 readout We are not allowed to call intel_runtime_pm_get from the PMU counter read callback since the former can sleep, and the latter is running under IRQ context. To workaround this, we record the last known RC6 and while runtime suspended estimate its increase by querying the runtime PM core timestamps. Downside of this approach is that we can temporarily lose a chunk of RC6 time, from the last PMU read-out to runtime suspend entry, but that will eventually catch up, once device comes back online and in the presence of PMU queries. Also, we have to be careful not to overshoot the RC6 estimate, so once resumed after a period of approximation, we only update the counter once it catches up. With the observation that RC6 is increasing while the device is suspended, this should not pose a problem and can only cause slight inaccuracies due clock base differences. v2: Simplify by estimating on top of PM core counters. (Imre) Signed-off-by: Tvrtko Ursulin Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=104943 Fixes: 6060b6aec03c ("drm/i915/pmu: Add RC6 residency metrics") Testcase: igt/perf_pmu/rc6-runtime-pm Cc: Tvrtko Ursulin Cc: Chris Wilson Cc: Imre Deak Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: David Airlie Cc: intel-gfx@lists.freedesktop.org Cc: dri-devel@lists.freedesktop.org Reviewed-by: Chris Wilson Link: https://patchwork.freedesktop.org/patch/msgid/20180206183311.17924-1-tvrtko.ursulin@linux.intel.com (cherry picked from commit 1fe699e30113ed6f6e853ff44710d256072ea627) Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20180213095747.2424-3-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/i915_pmu.c | 93 +++++++++++++++++++++++++++------ drivers/gpu/drm/i915/i915_pmu.h | 6 +++ 2 files changed, 84 insertions(+), 15 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index 337eaa6ede52..e13859aaa2a3 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -409,7 +409,81 @@ static int i915_pmu_event_init(struct perf_event *event) return 0; } -static u64 __i915_pmu_event_read(struct perf_event *event) +static u64 get_rc6(struct drm_i915_private *i915, bool locked) +{ + unsigned long flags; + u64 val; + + if (intel_runtime_pm_get_if_in_use(i915)) { + val = intel_rc6_residency_ns(i915, IS_VALLEYVIEW(i915) ? + VLV_GT_RENDER_RC6 : + GEN6_GT_GFX_RC6); + + if (HAS_RC6p(i915)) + val += intel_rc6_residency_ns(i915, GEN6_GT_GFX_RC6p); + + if (HAS_RC6pp(i915)) + val += intel_rc6_residency_ns(i915, GEN6_GT_GFX_RC6pp); + + intel_runtime_pm_put(i915); + + /* + * If we are coming back from being runtime suspended we must + * be careful not to report a larger value than returned + * previously. + */ + + if (!locked) + spin_lock_irqsave(&i915->pmu.lock, flags); + + if (val >= i915->pmu.sample[__I915_SAMPLE_RC6_ESTIMATED].cur) { + i915->pmu.sample[__I915_SAMPLE_RC6_ESTIMATED].cur = 0; + i915->pmu.sample[__I915_SAMPLE_RC6].cur = val; + } else { + val = i915->pmu.sample[__I915_SAMPLE_RC6_ESTIMATED].cur; + } + + if (!locked) + spin_unlock_irqrestore(&i915->pmu.lock, flags); + } else { + struct pci_dev *pdev = i915->drm.pdev; + struct device *kdev = &pdev->dev; + unsigned long flags2; + + /* + * We are runtime suspended. + * + * Report the delta from when the device was suspended to now, + * on top of the last known real value, as the approximated RC6 + * counter value. + */ + if (!locked) + spin_lock_irqsave(&i915->pmu.lock, flags); + + spin_lock_irqsave(&kdev->power.lock, flags2); + + if (!i915->pmu.sample[__I915_SAMPLE_RC6_ESTIMATED].cur) + i915->pmu.suspended_jiffies_last = + kdev->power.suspended_jiffies; + + val = kdev->power.suspended_jiffies - + i915->pmu.suspended_jiffies_last; + val += jiffies - kdev->power.accounting_timestamp; + + spin_unlock_irqrestore(&kdev->power.lock, flags2); + + val = jiffies_to_nsecs(val); + val += i915->pmu.sample[__I915_SAMPLE_RC6].cur; + i915->pmu.sample[__I915_SAMPLE_RC6_ESTIMATED].cur = val; + + if (!locked) + spin_unlock_irqrestore(&i915->pmu.lock, flags); + } + + return val; +} + +static u64 __i915_pmu_event_read(struct perf_event *event, bool locked) { struct drm_i915_private *i915 = container_of(event->pmu, typeof(*i915), pmu.base); @@ -447,18 +521,7 @@ static u64 __i915_pmu_event_read(struct perf_event *event) val = count_interrupts(i915); break; case I915_PMU_RC6_RESIDENCY: - intel_runtime_pm_get(i915); - val = intel_rc6_residency_ns(i915, - IS_VALLEYVIEW(i915) ? - VLV_GT_RENDER_RC6 : - GEN6_GT_GFX_RC6); - if (HAS_RC6p(i915)) - val += intel_rc6_residency_ns(i915, - GEN6_GT_GFX_RC6p); - if (HAS_RC6pp(i915)) - val += intel_rc6_residency_ns(i915, - GEN6_GT_GFX_RC6pp); - intel_runtime_pm_put(i915); + val = get_rc6(i915, locked); break; } } @@ -473,7 +536,7 @@ static void i915_pmu_event_read(struct perf_event *event) again: prev = local64_read(&hwc->prev_count); - new = __i915_pmu_event_read(event); + new = __i915_pmu_event_read(event, false); if (local64_cmpxchg(&hwc->prev_count, prev, new) != prev) goto again; @@ -528,7 +591,7 @@ static void i915_pmu_enable(struct perf_event *event) * for all listeners. Even when the event was already enabled and has * an existing non-zero value. */ - local64_set(&event->hw.prev_count, __i915_pmu_event_read(event)); + local64_set(&event->hw.prev_count, __i915_pmu_event_read(event, true)); spin_unlock_irqrestore(&i915->pmu.lock, flags); } diff --git a/drivers/gpu/drm/i915/i915_pmu.h b/drivers/gpu/drm/i915/i915_pmu.h index 40c154d13565..bb62df15afa4 100644 --- a/drivers/gpu/drm/i915/i915_pmu.h +++ b/drivers/gpu/drm/i915/i915_pmu.h @@ -27,6 +27,8 @@ enum { __I915_SAMPLE_FREQ_ACT = 0, __I915_SAMPLE_FREQ_REQ, + __I915_SAMPLE_RC6, + __I915_SAMPLE_RC6_ESTIMATED, __I915_NUM_PMU_SAMPLERS }; @@ -94,6 +96,10 @@ struct i915_pmu { * struct intel_engine_cs. */ struct i915_pmu_sample sample[__I915_NUM_PMU_SAMPLERS]; + /** + * @suspended_jiffies_last: Cached suspend time from PM core. + */ + unsigned long suspended_jiffies_last; }; #ifdef CONFIG_PERF_EVENTS From 4b8b41d15d9db54703958fbd2928a2fd319563f6 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Tue, 13 Feb 2018 09:57:47 +0000 Subject: [PATCH 074/149] drm/i915/pmu: Fix building without CONFIG_PM As we peek inside struct device to query members guarded by CONFIG_PM, so must be the code. Reported-by: kbuild test robot Fixes: 1fe699e30113 ("drm/i915/pmu: Fix sleep under atomic in RC6 readout") Signed-off-by: Chris Wilson Cc: Tvrtko Ursulin Reviewed-by: Tvrtko Ursulin Link: https://patchwork.freedesktop.org/patch/msgid/20180207160428.17015-1-chris@chris-wilson.co.uk (cherry picked from commit 05273c950a3c93c5f96be8807eaf24f2cc9f1c1e) Signed-off-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20180213095747.2424-4-tvrtko.ursulin@linux.intel.com --- drivers/gpu/drm/i915/i915_pmu.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_pmu.c b/drivers/gpu/drm/i915/i915_pmu.c index e13859aaa2a3..0e9b98c32b62 100644 --- a/drivers/gpu/drm/i915/i915_pmu.c +++ b/drivers/gpu/drm/i915/i915_pmu.c @@ -409,22 +409,32 @@ static int i915_pmu_event_init(struct perf_event *event) return 0; } +static u64 __get_rc6(struct drm_i915_private *i915) +{ + u64 val; + + val = intel_rc6_residency_ns(i915, + IS_VALLEYVIEW(i915) ? + VLV_GT_RENDER_RC6 : + GEN6_GT_GFX_RC6); + + if (HAS_RC6p(i915)) + val += intel_rc6_residency_ns(i915, GEN6_GT_GFX_RC6p); + + if (HAS_RC6pp(i915)) + val += intel_rc6_residency_ns(i915, GEN6_GT_GFX_RC6pp); + + return val; +} + static u64 get_rc6(struct drm_i915_private *i915, bool locked) { +#if IS_ENABLED(CONFIG_PM) unsigned long flags; u64 val; if (intel_runtime_pm_get_if_in_use(i915)) { - val = intel_rc6_residency_ns(i915, IS_VALLEYVIEW(i915) ? - VLV_GT_RENDER_RC6 : - GEN6_GT_GFX_RC6); - - if (HAS_RC6p(i915)) - val += intel_rc6_residency_ns(i915, GEN6_GT_GFX_RC6p); - - if (HAS_RC6pp(i915)) - val += intel_rc6_residency_ns(i915, GEN6_GT_GFX_RC6pp); - + val = __get_rc6(i915); intel_runtime_pm_put(i915); /* @@ -481,6 +491,9 @@ static u64 get_rc6(struct drm_i915_private *i915, bool locked) } return val; +#else + return __get_rc6(i915); +#endif } static u64 __i915_pmu_event_read(struct perf_event *event, bool locked) From 37ad4e68783088ed61493f54194cfccd3c87ab35 Mon Sep 17 00:00:00 2001 From: Weinan Li Date: Fri, 9 Feb 2018 16:01:34 +0800 Subject: [PATCH 075/149] drm/i915/gvt: add 0xe4f0 into gen9 render list Guest may set this register on KBL platform, it can impact hardware behavior, so add it into the gen9 render list. Otherwise gpu hang issue may happen during different vgpu switch. v2: separate it from patch set. Cc: Zhi Wang Cc: Zhenyu Wang Signed-off-by: Weinan Li Signed-off-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/mmio_context.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/gvt/mmio_context.c b/drivers/gpu/drm/i915/gvt/mmio_context.c index 73ad6e90e49d..256f1bb522b7 100644 --- a/drivers/gpu/drm/i915/gvt/mmio_context.c +++ b/drivers/gpu/drm/i915/gvt/mmio_context.c @@ -118,6 +118,7 @@ static struct engine_mmio gen9_engine_mmio_list[] __cacheline_aligned = { {RCS, HALF_SLICE_CHICKEN3, 0xffff, true}, /* 0xe184 */ {RCS, GEN9_HALF_SLICE_CHICKEN5, 0xffff, true}, /* 0xe188 */ {RCS, GEN9_HALF_SLICE_CHICKEN7, 0xffff, true}, /* 0xe194 */ + {RCS, GEN8_ROW_CHICKEN, 0xffff, true}, /* 0xe4f0 */ {RCS, TRVATTL3PTRDW(0), 0, false}, /* 0x4de0 */ {RCS, TRVATTL3PTRDW(1), 0, false}, /* 0x4de4 */ {RCS, TRNULLDETCT, 0, false}, /* 0x4de8 */ From a26ca6ad4c4aa4afcbfe4c46c33ad98859736245 Mon Sep 17 00:00:00 2001 From: Tina Zhang Date: Sun, 11 Feb 2018 14:59:19 +0800 Subject: [PATCH 076/149] drm/i915/gvt: Support BAR0 8-byte reads/writes GGTT is in BAR0 with 8 bytes aligned. With a qemu patch (commit: 38d49e8c1523d97d2191190d3f7b4ce7a0ab5aa3), VFIO can use 8-byte reads/ writes to access it. This patch is to support the 8-byte GGTT reads/writes. Ideally, we would like to support 8-byte reads/writes for the total BAR0. But it needs more work for handling 8-byte MMIO reads/writes. This patch can fix the issue caused by partial updating GGTT entry, during guest booting up. v3: - Use intel_vgpu_get_bar_gpa() stead. (Zhenyu) - Include all the GGTT checking logic in gtt_entry(). (Zhenyu) v2: - Limit to GGTT entry. (Zhenyu) Signed-off-by: Tina Zhang Signed-off-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/kvmgt.c | 51 ++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/kvmgt.c b/drivers/gpu/drm/i915/gvt/kvmgt.c index 909499b73d03..021f722e2481 100644 --- a/drivers/gpu/drm/i915/gvt/kvmgt.c +++ b/drivers/gpu/drm/i915/gvt/kvmgt.c @@ -733,6 +733,25 @@ static ssize_t intel_vgpu_rw(struct mdev_device *mdev, char *buf, return ret == 0 ? count : ret; } +static bool gtt_entry(struct mdev_device *mdev, loff_t *ppos) +{ + struct intel_vgpu *vgpu = mdev_get_drvdata(mdev); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + struct intel_gvt *gvt = vgpu->gvt; + int offset; + + /* Only allow MMIO GGTT entry access */ + if (index != PCI_BASE_ADDRESS_0) + return false; + + offset = (u64)(*ppos & VFIO_PCI_OFFSET_MASK) - + intel_vgpu_get_bar_gpa(vgpu, PCI_BASE_ADDRESS_0); + + return (offset >= gvt->device_info.gtt_start_offset && + offset < gvt->device_info.gtt_start_offset + gvt_ggtt_sz(gvt)) ? + true : false; +} + static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf, size_t count, loff_t *ppos) { @@ -742,7 +761,21 @@ static ssize_t intel_vgpu_read(struct mdev_device *mdev, char __user *buf, while (count) { size_t filled; - if (count >= 4 && !(*ppos % 4)) { + /* Only support GGTT entry 8 bytes read */ + if (count >= 8 && !(*ppos % 8) && + gtt_entry(mdev, ppos)) { + u64 val; + + ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val), + ppos, false); + if (ret <= 0) + goto read_err; + + if (copy_to_user(buf, &val, sizeof(val))) + goto read_err; + + filled = 8; + } else if (count >= 4 && !(*ppos % 4)) { u32 val; ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val), @@ -802,7 +835,21 @@ static ssize_t intel_vgpu_write(struct mdev_device *mdev, while (count) { size_t filled; - if (count >= 4 && !(*ppos % 4)) { + /* Only support GGTT entry 8 bytes write */ + if (count >= 8 && !(*ppos % 8) && + gtt_entry(mdev, ppos)) { + u64 val; + + if (copy_from_user(&val, buf, sizeof(val))) + goto write_err; + + ret = intel_vgpu_rw(mdev, (char *)&val, sizeof(val), + ppos, true); + if (ret <= 0) + goto write_err; + + filled = 8; + } else if (count >= 4 && !(*ppos % 4)) { u32 val; if (copy_from_user(&val, buf, sizeof(val))) From 3cc7644e4af179e79153b1fd60f9dd937ee32684 Mon Sep 17 00:00:00 2001 From: Weinan Li Date: Mon, 12 Feb 2018 15:28:42 +0800 Subject: [PATCH 077/149] drm/i915/gvt: fix one typo of render_mmio trace Fix one typo of render_mmio trace, exchange the mmio value of old and new. Signed-off-by: Weinan Li Signed-off-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gvt/trace.h b/drivers/gpu/drm/i915/gvt/trace.h index 7a2511538f34..736bd2bc5127 100644 --- a/drivers/gpu/drm/i915/gvt/trace.h +++ b/drivers/gpu/drm/i915/gvt/trace.h @@ -333,7 +333,7 @@ TRACE_EVENT(render_mmio, TP_PROTO(int old_id, int new_id, char *action, unsigned int reg, unsigned int old_val, unsigned int new_val), - TP_ARGS(old_id, new_id, action, reg, new_val, old_val), + TP_ARGS(old_id, new_id, action, reg, old_val, new_val), TP_STRUCT__entry( __field(int, old_id) From d15d662e89fc667b90cd294b0eb45694e33144da Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 12 Feb 2018 15:20:51 +0100 Subject: [PATCH 078/149] ALSA: seq: Fix racy pool initializations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ALSA sequencer core initializes the event pool on demand by invoking snd_seq_pool_init() when the first write happens and the pool is empty. Meanwhile user can reset the pool size manually via ioctl concurrently, and this may lead to UAF or out-of-bound accesses since the function tries to vmalloc / vfree the buffer. A simple fix is to just wrap the snd_seq_pool_init() call with the recently introduced client->ioctl_mutex; as the calls for snd_seq_pool_init() from other side are always protected with this mutex, we can avoid the race. Reported-by: 范龙飞 Cc: Signed-off-by: Takashi Iwai --- sound/core/seq/seq_clientmgr.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index 60db32785f62..04d4db44fae5 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -1003,7 +1003,7 @@ static ssize_t snd_seq_write(struct file *file, const char __user *buf, { struct snd_seq_client *client = file->private_data; int written = 0, len; - int err = -EINVAL; + int err; struct snd_seq_event event; if (!(snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_OUTPUT)) @@ -1018,11 +1018,15 @@ static ssize_t snd_seq_write(struct file *file, const char __user *buf, /* allocate the pool now if the pool is not allocated yet */ if (client->pool->size > 0 && !snd_seq_write_pool_allocated(client)) { - if (snd_seq_pool_init(client->pool) < 0) + mutex_lock(&client->ioctl_mutex); + err = snd_seq_pool_init(client->pool); + mutex_unlock(&client->ioctl_mutex); + if (err < 0) return -ENOMEM; } /* only process whole events */ + err = -EINVAL; while (count >= sizeof(struct snd_seq_event)) { /* Read in the event header from the user */ len = sizeof(event); From fe0e58048f005fdce315eb4d185e5c160be4ac01 Mon Sep 17 00:00:00 2001 From: Jerome Brunet Date: Mon, 12 Feb 2018 14:13:59 +0100 Subject: [PATCH 079/149] Revert "mmc: meson-gx: include tx phase in the tuning process" This reverts commit 0a44697627d17a66d7dc98f17aeca07ca79c5c20. This commit was initially intended to fix problems with hs200 and hs400 on some boards, mainly the odroid-c2. The OC2 (Rev 0.2) I have performs well in this modes, so I could not confirm these issues. We've had several reports about the issues being still present on (some) OC2, so apparently, this change does not do what it was supposed to do. Maybe the eMMC signal quality is on the edge on the board. This may explain the variability we see in term of stability, but this is just a guess. Lowering the max_frequency to 100Mhz seems to do trick for those affected by the issue Worse, the commit created new issues (CRC errors and hangs) on other boards, such as the kvim 1 and 2, the p200 or the libretech-cc. According to amlogic, the Tx phase should not be tuned and left in its default configuration, so it is best to just revert the commit. Fixes: 0a44697627d1 ("mmc: meson-gx: include tx phase in the tuning process") Cc: # 4.14+ Signed-off-by: Jerome Brunet Signed-off-by: Ulf Hansson --- drivers/mmc/host/meson-gx-mmc.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index 22438ebfe4e6..4f972b879fe6 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -717,22 +717,6 @@ static int meson_mmc_clk_phase_tuning(struct mmc_host *mmc, u32 opcode, static int meson_mmc_execute_tuning(struct mmc_host *mmc, u32 opcode) { struct meson_host *host = mmc_priv(mmc); - int ret; - - /* - * If this is the initial tuning, try to get a sane Rx starting - * phase before doing the actual tuning. - */ - if (!mmc->doing_retune) { - ret = meson_mmc_clk_phase_tuning(mmc, opcode, host->rx_clk); - - if (ret) - return ret; - } - - ret = meson_mmc_clk_phase_tuning(mmc, opcode, host->tx_clk); - if (ret) - return ret; return meson_mmc_clk_phase_tuning(mmc, opcode, host->rx_clk); } @@ -763,9 +747,8 @@ static void meson_mmc_set_ios(struct mmc_host *mmc, struct mmc_ios *ios) if (!IS_ERR(mmc->supply.vmmc)) mmc_regulator_set_ocr(mmc, mmc->supply.vmmc, ios->vdd); - /* Reset phases */ + /* Reset rx phase */ clk_set_phase(host->rx_clk, 0); - clk_set_phase(host->tx_clk, 270); break; From 118032be389009b07ecb5a03ffe219a89d421def Mon Sep 17 00:00:00 2001 From: Phil Elwell Date: Mon, 12 Feb 2018 21:13:44 +0100 Subject: [PATCH 080/149] mmc: bcm2835: Don't overwrite max frequency unconditionally The optional DT parameter max-frequency could init the max bus frequency. So take care of this, before setting the max bus frequency. Fixes: 660fc733bd74 ("mmc: bcm2835: Add new driver for the sdhost controller.") Signed-off-by: Phil Elwell Signed-off-by: Stefan Wahren Cc: # 4.12+ Signed-off-by: Ulf Hansson --- drivers/mmc/host/bcm2835.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/bcm2835.c b/drivers/mmc/host/bcm2835.c index 229dc18f0581..768972af8b85 100644 --- a/drivers/mmc/host/bcm2835.c +++ b/drivers/mmc/host/bcm2835.c @@ -1265,7 +1265,8 @@ static int bcm2835_add_host(struct bcm2835_host *host) char pio_limit_string[20]; int ret; - mmc->f_max = host->max_clk; + if (!mmc->f_max || mmc->f_max > host->max_clk) + mmc->f_max = host->max_clk; mmc->f_min = host->max_clk / SDCDIV_MAX_CDIV; mmc->max_busy_timeout = ~0 / (mmc->f_max / 1000); From fdcc968a3b290407bcba9d4c90e2fba6d8d928f1 Mon Sep 17 00:00:00 2001 From: Jan-Marek Glogowski Date: Wed, 14 Feb 2018 11:29:15 +0100 Subject: [PATCH 081/149] ALSA: hda/realtek: PCI quirk for Fujitsu U7x7 These laptops have a combined jack to attach headsets, the U727 on the left, the U757 on the right, but a headsets microphone doesn't work. Using hdajacksensetest I found that pin 0x19 changed the present state when plugging the headset, in addition to 0x21, but didn't have the correct configuration (shown as "Not connected"). So this sets the configuration to the same values as the headphone pin 0x21 except for the device type microphone, which makes it work correctly. With the patch the configured pins for U727 are Pin 0x12 (Internal Mic, Mobile-In): present = No Pin 0x14 (Internal Speaker): present = No Pin 0x19 (Black Mic, Left side): present = No Pin 0x1d (Internal Aux): present = No Pin 0x21 (Black Headphone, Left side): present = No Signed-off-by: Jan-Marek Glogowski Cc: Signed-off-by: Takashi Iwai --- sound/pci/hda/patch_realtek.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 32938ca8e5e3..ce28f7ce64e6 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -3465,6 +3465,19 @@ static void alc269_fixup_pincfg_no_hp_to_lineout(struct hda_codec *codec, spec->parse_flags = HDA_PINCFG_NO_HP_FIXUP; } +static void alc269_fixup_pincfg_U7x7_headset_mic(struct hda_codec *codec, + const struct hda_fixup *fix, + int action) +{ + unsigned int cfg_headphone = snd_hda_codec_get_pincfg(codec, 0x21); + unsigned int cfg_headset_mic = snd_hda_codec_get_pincfg(codec, 0x19); + + if (cfg_headphone && cfg_headset_mic == 0x411111f0) + snd_hda_codec_set_pincfg(codec, 0x19, + (cfg_headphone & ~AC_DEFCFG_DEVICE) | + (AC_JACK_MIC_IN << AC_DEFCFG_DEVICE_SHIFT)); +} + static void alc269_fixup_hweq(struct hda_codec *codec, const struct hda_fixup *fix, int action) { @@ -5373,6 +5386,7 @@ enum { ALC269_FIXUP_LIFEBOOK_EXTMIC, ALC269_FIXUP_LIFEBOOK_HP_PIN, ALC269_FIXUP_LIFEBOOK_NO_HP_TO_LINEOUT, + ALC255_FIXUP_LIFEBOOK_U7x7_HEADSET_MIC, ALC269_FIXUP_AMIC, ALC269_FIXUP_DMIC, ALC269VB_FIXUP_AMIC, @@ -5579,6 +5593,10 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc269_fixup_pincfg_no_hp_to_lineout, }, + [ALC255_FIXUP_LIFEBOOK_U7x7_HEADSET_MIC] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc269_fixup_pincfg_U7x7_headset_mic, + }, [ALC269_FIXUP_AMIC] = { .type = HDA_FIXUP_PINS, .v.pins = (const struct hda_pintbl[]) { @@ -6453,6 +6471,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x10cf, 0x159f, "Lifebook E780", ALC269_FIXUP_LIFEBOOK_NO_HP_TO_LINEOUT), SND_PCI_QUIRK(0x10cf, 0x15dc, "Lifebook T731", ALC269_FIXUP_LIFEBOOK_HP_PIN), SND_PCI_QUIRK(0x10cf, 0x1757, "Lifebook E752", ALC269_FIXUP_LIFEBOOK_HP_PIN), + SND_PCI_QUIRK(0x10cf, 0x1629, "Lifebook U7x7", ALC255_FIXUP_LIFEBOOK_U7x7_HEADSET_MIC), SND_PCI_QUIRK(0x10cf, 0x1845, "Lifebook U904", ALC269_FIXUP_LIFEBOOK_EXTMIC), SND_PCI_QUIRK(0x10ec, 0x10f2, "Intel Reference board", ALC700_FIXUP_INTEL_REFERENCE), SND_PCI_QUIRK(0x144d, 0xc109, "Samsung Ativ book 9 (NP900X3G)", ALC269_FIXUP_INV_DMIC), From fa08a3b4eba59429cf7e241a7af089103e79160f Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Mon, 18 Dec 2017 17:21:23 +0100 Subject: [PATCH 082/149] virtio/s390: implement PM operations for virtio_ccw Suspend/Resume to/from disk currently fails. Let us wire up the necessary callbacks. This is mostly just forwarding the requests to the virtio drivers. The only thing that has to be done in virtio_ccw itself is to re-set the virtio revision. Suggested-by: Thomas Huth Signed-off-by: Christian Borntraeger Message-Id: <20171207141102.70190-2-borntraeger@de.ibm.com> Reviewed-by: David Hildenbrand [CH: merged <20171218083706.223836-1-borntraeger@de.ibm.com> to fix !CONFIG_PM configs] Signed-off-by: Cornelia Huck Signed-off-by: Michael S. Tsirkin --- drivers/s390/virtio/virtio_ccw.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index ba2e0856d22c..8f5c1d7f751a 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -1297,6 +1297,9 @@ static int virtio_ccw_cio_notify(struct ccw_device *cdev, int event) vcdev->device_lost = true; rc = NOTIFY_DONE; break; + case CIO_OPER: + rc = NOTIFY_OK; + break; default: rc = NOTIFY_DONE; break; @@ -1309,6 +1312,27 @@ static struct ccw_device_id virtio_ids[] = { {}, }; +#ifdef CONFIG_PM_SLEEP +static int virtio_ccw_freeze(struct ccw_device *cdev) +{ + struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev); + + return virtio_device_freeze(&vcdev->vdev); +} + +static int virtio_ccw_restore(struct ccw_device *cdev) +{ + struct virtio_ccw_device *vcdev = dev_get_drvdata(&cdev->dev); + int ret; + + ret = virtio_ccw_set_transport_rev(vcdev); + if (ret) + return ret; + + return virtio_device_restore(&vcdev->vdev); +} +#endif + static struct ccw_driver virtio_ccw_driver = { .driver = { .owner = THIS_MODULE, @@ -1321,6 +1345,11 @@ static struct ccw_driver virtio_ccw_driver = { .set_online = virtio_ccw_online, .notify = virtio_ccw_cio_notify, .int_class = IRQIO_VIR, +#ifdef CONFIG_PM_SLEEP + .freeze = virtio_ccw_freeze, + .thaw = virtio_ccw_restore, + .restore = virtio_ccw_restore, +#endif }; static int __init pure_hex(char **cp, unsigned int *val, int min_digit, From 7756f72ccd4359c6df61fc431cd3b5b0a8639837 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Tue, 30 Jan 2018 10:07:01 +0000 Subject: [PATCH 083/149] nvmet: Change return code of discard command if not supported Execute discard command on block device that doesn't support it should return success. Returning internal error while using multi-path fails the path. Reviewed-by: Max Gurtovoy Signed-off-by: Israel Rukshin Signed-off-by: Sagi Grimberg --- drivers/nvme/target/io-cmd.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 0a4372a016f2..28bbdff4a88b 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -105,10 +105,13 @@ static void nvmet_execute_flush(struct nvmet_req *req) static u16 nvmet_discard_range(struct nvmet_ns *ns, struct nvme_dsm_range *range, struct bio **bio) { - if (__blkdev_issue_discard(ns->bdev, + int ret; + + ret = __blkdev_issue_discard(ns->bdev, le64_to_cpu(range->slba) << (ns->blksize_shift - 9), le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), - GFP_KERNEL, 0, bio)) + GFP_KERNEL, 0, bio); + if (ret && ret != -EOPNOTSUPP) return NVME_SC_INTERNAL | NVME_SC_DNR; return 0; } From 8000d1fdb07e365e6565c2415aefdfed15413794 Mon Sep 17 00:00:00 2001 From: Nitzan Carmi Date: Wed, 17 Jan 2018 11:01:14 +0000 Subject: [PATCH 084/149] nvme-rdma: fix sysfs invoked reset_ctrl error flow When reset_controller that is invoked by sysfs fails, it enters an error flow which practically removes the nvme ctrl entirely (similar to delete_ctrl flow). It causes the system to hang, since a sysfs attribute cannot be unregistered by one of its own methods. This can be fixed by calling delete_ctrl as a work rather than sequential code. In addition, it should give the ctrl a chance to recover using reconnection mechanism (consistant with FC reset_ctrl error flow). Also, while we're here, return suitable errno in case the reset ended with non live ctrl. Signed-off-by: Nitzan Carmi Reviewed-by: Max Gurtovoy Signed-off-by: Sagi Grimberg --- drivers/nvme/host/core.c | 6 +++++- drivers/nvme/host/rdma.c | 7 ++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 52b3626fb64e..0fe7ea35c221 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -120,8 +120,12 @@ int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) int ret; ret = nvme_reset_ctrl(ctrl); - if (!ret) + if (!ret) { flush_work(&ctrl->reset_work); + if (ctrl->state != NVME_CTRL_LIVE) + ret = -ENETRESET; + } + return ret; } EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 5e2cc4f0d207..3a51ed50eff2 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1784,11 +1784,8 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work) return; out_fail: - dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); - nvme_remove_namespaces(&ctrl->ctrl); - nvme_rdma_shutdown_ctrl(ctrl, true); - nvme_uninit_ctrl(&ctrl->ctrl); - nvme_put_ctrl(&ctrl->ctrl); + ++ctrl->ctrl.nr_reconnects; + nvme_rdma_reconnect_or_remove(ctrl); } static const struct nvme_ctrl_ops nvme_rdma_ctrl_ops = { From 2ce77f6d8a9ae9ce6d80397d88bdceb84a2004cd Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 13 Feb 2018 13:14:09 +0000 Subject: [PATCH 085/149] arm64: proc: Set PTE_NG for table entries to avoid traversing them twice When KASAN is enabled, the swapper page table contains many identical mappings of the zero page, which can lead to a stall during boot whilst the G -> nG code continually walks the same page table entries looking for global mappings. This patch sets the nG bit (bit 11, which is IGNORED) in table entries after processing the subtree so we can easily skip them if we see them a second time. Tested-by: Mark Rutland Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/mm/proc.S | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 71baed7e592a..c0af47617299 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -205,7 +205,8 @@ ENDPROC(idmap_cpu_replace_ttbr1) dc cvac, cur_\()\type\()p // Ensure any existing dirty dmb sy // lines are written back before ldr \type, [cur_\()\type\()p] // loading the entry - tbz \type, #0, next_\()\type // Skip invalid entries + tbz \type, #0, skip_\()\type // Skip invalid and + tbnz \type, #11, skip_\()\type // non-global entries .endm .macro __idmap_kpti_put_pgtable_ent_ng, type @@ -265,8 +266,9 @@ ENTRY(idmap_kpti_install_ng_mappings) add end_pgdp, cur_pgdp, #(PTRS_PER_PGD * 8) do_pgd: __idmap_kpti_get_pgtable_ent pgd tbnz pgd, #1, walk_puds - __idmap_kpti_put_pgtable_ent_ng pgd next_pgd: + __idmap_kpti_put_pgtable_ent_ng pgd +skip_pgd: add cur_pgdp, cur_pgdp, #8 cmp cur_pgdp, end_pgdp b.ne do_pgd @@ -294,8 +296,9 @@ walk_puds: add end_pudp, cur_pudp, #(PTRS_PER_PUD * 8) do_pud: __idmap_kpti_get_pgtable_ent pud tbnz pud, #1, walk_pmds - __idmap_kpti_put_pgtable_ent_ng pud next_pud: + __idmap_kpti_put_pgtable_ent_ng pud +skip_pud: add cur_pudp, cur_pudp, 8 cmp cur_pudp, end_pudp b.ne do_pud @@ -314,8 +317,9 @@ walk_pmds: add end_pmdp, cur_pmdp, #(PTRS_PER_PMD * 8) do_pmd: __idmap_kpti_get_pgtable_ent pmd tbnz pmd, #1, walk_ptes - __idmap_kpti_put_pgtable_ent_ng pmd next_pmd: + __idmap_kpti_put_pgtable_ent_ng pmd +skip_pmd: add cur_pmdp, cur_pmdp, #8 cmp cur_pmdp, end_pmdp b.ne do_pmd @@ -333,7 +337,7 @@ walk_ptes: add end_ptep, cur_ptep, #(PTRS_PER_PTE * 8) do_pte: __idmap_kpti_get_pgtable_ent pte __idmap_kpti_put_pgtable_ent_ng pte -next_pte: +skip_pte: add cur_ptep, cur_ptep, #8 cmp cur_ptep, end_ptep b.ne do_pte From 405cacc947f7b58969b2a8ab1568c2d98b245308 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 20 Dec 2017 11:50:17 +0100 Subject: [PATCH 086/149] drm/i915/vlv: Add cdclk workaround for DSI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit At least on the Chuwi Vi8 (non pro/plus) the LCD panel will show an image shifted aprox. 20% to the left (with wraparound) and sometimes also wrong colors, showing that the panel controller is starting with sampling the datastream somewhere mid-line. This happens after the first blanking and re-init of the panel. After looking at drm.debug output I noticed that initially we inherit the cdclk of 333333 KHz set by the GOP, but after the re-init we picked 266667 KHz, which turns out to be the cause of this problem, a quick hack to hard code the cdclk to 333333 KHz makes the problem go away. I've tested this on various Bay Trail devices, to make sure this not does cause regressions on other devices and the higher cdclk does not cause any problems on the following devices: -GP-electronic T701 1024x600 333333 KHz cdclk after this patch -PEAQ C1010 1920x1200 333333 KHz cdclk after this patch -PoV mobii-wintab-800w 800x1280 333333 KHz cdclk after this patch -Asus Transformer-T100TA 1368x768 320000 KHz cdclk after this patch Also interesting wrt this is the comment in vlv_calc_cdclk about the existing workaround to avoid 200 Mhz as clock because that causes issues in some cases. This commit extends the "do not use 200 Mhz" workaround with an extra check to require atleast 320000 KHz (avoiding 266667 KHz) when a DSI panel is active. Changes in v2: -Change the commit message and the code comment to not treat the GOP as a reference, the GOP should not be treated as a reference Acked-by: Ville Syrjälä Signed-off-by: Hans de Goede Link: https://patchwork.freedesktop.org/patch/msgid/20171220105017.11259-1-hdegoede@redhat.com (cherry picked from commit c8dae55a8ced625038d52d26e48273707fab2688) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_cdclk.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_cdclk.c b/drivers/gpu/drm/i915/intel_cdclk.c index 5dc118f26b51..1704c8897afd 100644 --- a/drivers/gpu/drm/i915/intel_cdclk.c +++ b/drivers/gpu/drm/i915/intel_cdclk.c @@ -1952,6 +1952,14 @@ int intel_crtc_compute_min_cdclk(const struct intel_crtc_state *crtc_state) if (crtc_state->has_audio && INTEL_GEN(dev_priv) >= 9) min_cdclk = max(2 * 96000, min_cdclk); + /* + * On Valleyview some DSI panels lose (v|h)sync when the clock is lower + * than 320000KHz. + */ + if (intel_crtc_has_type(crtc_state, INTEL_OUTPUT_DSI) && + IS_VALLEYVIEW(dev_priv)) + min_cdclk = max(320000, min_cdclk); + if (min_cdclk > dev_priv->max_cdclk_freq) { DRM_DEBUG_KMS("required cdclk (%d kHz) exceeds max (%d kHz)\n", min_cdclk, dev_priv->max_cdclk_freq); From 7928e9bb09dc7f108a1a2b589ef1c7b86843569c Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 14 Feb 2018 09:21:49 +0100 Subject: [PATCH 087/149] drm/i915: Add intel_bios_cleanup() function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add an intel_bios_cleanup() function to act as counterpart of intel_bios_init() and move the cleanup of vbt related resources there, putting it in the same file as the allocation. Changed in v2: -While touching the code anyways, remove the unnecessary: if (dev_priv->vbt.child_dev) done before kfree(dev_priv->vbt.child_dev) Reviewed-by: Ville Syrjälä Signed-off-by: Hans de Goede Link: https://patchwork.freedesktop.org/patch/msgid/20180214082151.25015-1-hdegoede@redhat.com (cherry picked from commit 785f076b3ba781804f2b22b347b4431e3efb0ab3) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_drv.c | 14 +------------- drivers/gpu/drm/i915/i915_drv.h | 1 + drivers/gpu/drm/i915/intel_bios.c | 15 +++++++++++++++ 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 173d0095e3b2..2f5209de0391 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -1433,19 +1433,7 @@ void i915_driver_unload(struct drm_device *dev) intel_modeset_cleanup(dev); - /* - * free the memory space allocated for the child device - * config parsed from VBT - */ - if (dev_priv->vbt.child_dev && dev_priv->vbt.child_dev_num) { - kfree(dev_priv->vbt.child_dev); - dev_priv->vbt.child_dev = NULL; - dev_priv->vbt.child_dev_num = 0; - } - kfree(dev_priv->vbt.sdvo_lvds_vbt_mode); - dev_priv->vbt.sdvo_lvds_vbt_mode = NULL; - kfree(dev_priv->vbt.lfp_lvds_vbt_mode); - dev_priv->vbt.lfp_lvds_vbt_mode = NULL; + intel_bios_cleanup(dev_priv); vga_switcheroo_unregister_client(pdev); vga_client_register(pdev, NULL, NULL, NULL); diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index a42deebedb0f..d2fc519bc592 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -3657,6 +3657,7 @@ extern void intel_i2c_reset(struct drm_i915_private *dev_priv); /* intel_bios.c */ void intel_bios_init(struct drm_i915_private *dev_priv); +void intel_bios_cleanup(struct drm_i915_private *dev_priv); bool intel_bios_is_valid_vbt(const void *buf, size_t size); bool intel_bios_is_tv_present(struct drm_i915_private *dev_priv); bool intel_bios_is_lvds_present(struct drm_i915_private *dev_priv, u8 *i2c_pin); diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c index f7f771749e48..57db816f962b 100644 --- a/drivers/gpu/drm/i915/intel_bios.c +++ b/drivers/gpu/drm/i915/intel_bios.c @@ -1588,6 +1588,21 @@ out: pci_unmap_rom(pdev, bios); } +/** + * intel_bios_cleanup - Free any resources allocated by intel_bios_init() + * @dev_priv: i915 device instance + */ +void intel_bios_cleanup(struct drm_i915_private *dev_priv) +{ + kfree(dev_priv->vbt.child_dev); + dev_priv->vbt.child_dev = NULL; + dev_priv->vbt.child_dev_num = 0; + kfree(dev_priv->vbt.sdvo_lvds_vbt_mode); + dev_priv->vbt.sdvo_lvds_vbt_mode = NULL; + kfree(dev_priv->vbt.lfp_lvds_vbt_mode); + dev_priv->vbt.lfp_lvds_vbt_mode = NULL; +} + /** * intel_bios_is_tv_present - is integrated TV present in VBT * @dev_priv: i915 device instance From ed0545a7fbb5241a27f45a084dd71522cdaea5b9 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 14 Feb 2018 09:21:50 +0100 Subject: [PATCH 088/149] drm/i915: Free memdup-ed DSI VBT data structures on driver_unload MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make intel_bios_cleanup function free the DSI VBT data structures which are memdup-ed by parse_mipi_config() and parse_mipi_sequence(). Reviewed-by: Ville Syrjälä Signed-off-by: Hans de Goede Link: https://patchwork.freedesktop.org/patch/msgid/20180214082151.25015-2-hdegoede@redhat.com (cherry picked from commit e1b86c85f6c2029c31dba99823b6f3d9e15eaacd) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/intel_bios.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c index 57db816f962b..9a9b62c93889 100644 --- a/drivers/gpu/drm/i915/intel_bios.c +++ b/drivers/gpu/drm/i915/intel_bios.c @@ -1601,6 +1601,12 @@ void intel_bios_cleanup(struct drm_i915_private *dev_priv) dev_priv->vbt.sdvo_lvds_vbt_mode = NULL; kfree(dev_priv->vbt.lfp_lvds_vbt_mode); dev_priv->vbt.lfp_lvds_vbt_mode = NULL; + kfree(dev_priv->vbt.dsi.data); + dev_priv->vbt.dsi.data = NULL; + kfree(dev_priv->vbt.dsi.pps); + dev_priv->vbt.dsi.pps = NULL; + kfree(dev_priv->vbt.dsi.config); + dev_priv->vbt.dsi.config = NULL; } /** From ee622fe757f6de612dad0f01805eea815a5b3025 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 14 Feb 2018 09:21:51 +0100 Subject: [PATCH 089/149] drm/i915: Fix DSI panels with v1 MIPI sequences without a DEASSERT sequence v3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit So far models of the Dell Venue 8 Pro, with a panel with MIPI panel index = 3, one of which has been kindly provided to me by Jan Brummer, where not working with the i915 driver, giving a black screen on the first modeset. The problem with at least these Dells is that their VBT defines a MIPI ASSERT sequence, but not a DEASSERT sequence. Instead they DEASSERT the reset in their INIT_OTP sequence, but the deassert must be done before calling intel_dsi_device_ready(), so that is too late. Simply doing the INIT_OTP sequence earlier is not enough to fix this, because the INIT_OTP sequence also sends various MIPI packets to the panel, which can only happen after calling intel_dsi_device_ready(). This commit fixes this by splitting the INIT_OTP sequence into everything before the first DSI packet and everything else, including the first DSI packet. The first part (everything before the first DSI packet) is then used as deassert sequence. Changed in v2: -Split the init OTP sequence into a deassert reset and the actual init OTP sequence, instead of calling it earlier and then having the first mipi_exec_send_packet() call call intel_dsi_device_ready(). Changes in v3: -Move the whole shebang to intel_bios.c Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=82880 References: https://bugs.freedesktop.org/show_bug.cgi?id=101205 Cc: Jan-Michael Brummer Reported-by: Jan-Michael Brummer Tested-by: Hans de Goede Reviewed-by: Ville Syrjälä Acked-by: Jani Nikula Signed-off-by: Hans de Goede Link: https://patchwork.freedesktop.org/patch/msgid/20180214082151.25015-3-hdegoede@redhat.com (cherry picked from commit fb38e7ade9af4f3e96f5916c3f6f19bfc7d5f961) Signed-off-by: Rodrigo Vivi --- drivers/gpu/drm/i915/i915_drv.h | 1 + drivers/gpu/drm/i915/intel_bios.c | 84 +++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index d2fc519bc592..d307429a5ae0 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -1349,6 +1349,7 @@ struct intel_vbt_data { u32 size; u8 *data; const u8 *sequence[MIPI_SEQ_MAX]; + u8 *deassert_seq; /* Used by fixup_mipi_sequences() */ } dsi; int crt_ddc_pin; diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c index 9a9b62c93889..b49a2df44430 100644 --- a/drivers/gpu/drm/i915/intel_bios.c +++ b/drivers/gpu/drm/i915/intel_bios.c @@ -947,6 +947,86 @@ static int goto_next_sequence_v3(const u8 *data, int index, int total) return 0; } +/* + * Get len of pre-fixed deassert fragment from a v1 init OTP sequence, + * skip all delay + gpio operands and stop at the first DSI packet op. + */ +static int get_init_otp_deassert_fragment_len(struct drm_i915_private *dev_priv) +{ + const u8 *data = dev_priv->vbt.dsi.sequence[MIPI_SEQ_INIT_OTP]; + int index, len; + + if (WARN_ON(!data || dev_priv->vbt.dsi.seq_version != 1)) + return 0; + + /* index = 1 to skip sequence byte */ + for (index = 1; data[index] != MIPI_SEQ_ELEM_END; index += len) { + switch (data[index]) { + case MIPI_SEQ_ELEM_SEND_PKT: + return index == 1 ? 0 : index; + case MIPI_SEQ_ELEM_DELAY: + len = 5; /* 1 byte for operand + uint32 */ + break; + case MIPI_SEQ_ELEM_GPIO: + len = 3; /* 1 byte for op, 1 for gpio_nr, 1 for value */ + break; + default: + return 0; + } + } + + return 0; +} + +/* + * Some v1 VBT MIPI sequences do the deassert in the init OTP sequence. + * The deassert must be done before calling intel_dsi_device_ready, so for + * these devices we split the init OTP sequence into a deassert sequence and + * the actual init OTP part. + */ +static void fixup_mipi_sequences(struct drm_i915_private *dev_priv) +{ + u8 *init_otp; + int len; + + /* Limit this to VLV for now. */ + if (!IS_VALLEYVIEW(dev_priv)) + return; + + /* Limit this to v1 vid-mode sequences */ + if (dev_priv->vbt.dsi.config->is_cmd_mode || + dev_priv->vbt.dsi.seq_version != 1) + return; + + /* Only do this if there are otp and assert seqs and no deassert seq */ + if (!dev_priv->vbt.dsi.sequence[MIPI_SEQ_INIT_OTP] || + !dev_priv->vbt.dsi.sequence[MIPI_SEQ_ASSERT_RESET] || + dev_priv->vbt.dsi.sequence[MIPI_SEQ_DEASSERT_RESET]) + return; + + /* The deassert-sequence ends at the first DSI packet */ + len = get_init_otp_deassert_fragment_len(dev_priv); + if (!len) + return; + + DRM_DEBUG_KMS("Using init OTP fragment to deassert reset\n"); + + /* Copy the fragment, update seq byte and terminate it */ + init_otp = (u8 *)dev_priv->vbt.dsi.sequence[MIPI_SEQ_INIT_OTP]; + dev_priv->vbt.dsi.deassert_seq = kmemdup(init_otp, len + 1, GFP_KERNEL); + if (!dev_priv->vbt.dsi.deassert_seq) + return; + dev_priv->vbt.dsi.deassert_seq[0] = MIPI_SEQ_DEASSERT_RESET; + dev_priv->vbt.dsi.deassert_seq[len] = MIPI_SEQ_ELEM_END; + /* Use the copy for deassert */ + dev_priv->vbt.dsi.sequence[MIPI_SEQ_DEASSERT_RESET] = + dev_priv->vbt.dsi.deassert_seq; + /* Replace the last byte of the fragment with init OTP seq byte */ + init_otp[len - 1] = MIPI_SEQ_INIT_OTP; + /* And make MIPI_MIPI_SEQ_INIT_OTP point to it */ + dev_priv->vbt.dsi.sequence[MIPI_SEQ_INIT_OTP] = init_otp + len - 1; +} + static void parse_mipi_sequence(struct drm_i915_private *dev_priv, const struct bdb_header *bdb) @@ -1016,6 +1096,8 @@ parse_mipi_sequence(struct drm_i915_private *dev_priv, dev_priv->vbt.dsi.size = seq_size; dev_priv->vbt.dsi.seq_version = sequence->version; + fixup_mipi_sequences(dev_priv); + DRM_DEBUG_DRIVER("MIPI related VBT parsing complete\n"); return; @@ -1607,6 +1689,8 @@ void intel_bios_cleanup(struct drm_i915_private *dev_priv) dev_priv->vbt.dsi.pps = NULL; kfree(dev_priv->vbt.dsi.config); dev_priv->vbt.dsi.config = NULL; + kfree(dev_priv->vbt.dsi.deassert_seq); + dev_priv->vbt.dsi.deassert_seq = NULL; } /** From c134f0d57a47b7f8704dee1cefc246f9471f3e80 Mon Sep 17 00:00:00 2001 From: Cyril Bur Date: Wed, 14 Feb 2018 14:27:06 +1100 Subject: [PATCH 090/149] powerpc: Expose TSCR via sysfs only on powernv The TSCR can only be accessed in hypervisor mode. Fixes: 88b5e12eeb11 ("powerpc: Expose TSCR via sysfs") Signed-off-by: Cyril Bur Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/sysfs.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index 5a8bfee6e187..04d0bbd7a1dd 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -788,7 +788,8 @@ static int register_cpu_online(unsigned int cpu) if (cpu_has_feature(CPU_FTR_PPCAS_ARCH_V2)) device_create_file(s, &dev_attr_pir); - if (cpu_has_feature(CPU_FTR_ARCH_206)) + if (cpu_has_feature(CPU_FTR_ARCH_206) && + !firmware_has_feature(FW_FEATURE_LPAR)) device_create_file(s, &dev_attr_tscr); #endif /* CONFIG_PPC64 */ @@ -873,7 +874,8 @@ static int unregister_cpu_online(unsigned int cpu) if (cpu_has_feature(CPU_FTR_PPCAS_ARCH_V2)) device_remove_file(s, &dev_attr_pir); - if (cpu_has_feature(CPU_FTR_ARCH_206)) + if (cpu_has_feature(CPU_FTR_ARCH_206) && + !firmware_has_feature(FW_FEATURE_LPAR)) device_remove_file(s, &dev_attr_tscr); #endif /* CONFIG_PPC64 */ From 8e036c8d30a2cd9d8fc7442fbf6824e0a3e986e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9dric=20Le=20Goater?= Date: Tue, 13 Feb 2018 09:47:12 +0100 Subject: [PATCH 091/149] powerpc/xive: Use hw CPU ids when configuring the CPU queues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CPU event notification queues on sPAPR should be configured using a hardware CPU identifier. The problem did not show up on the Power Hypervisor because pHyp supports 8 threads per core which keeps CPU number contiguous. This is not the case on all sPAPR virtual machines, some use SMT=1. Also improve error logging by adding the CPU number. Fixes: eac1e731b59e ("powerpc/xive: guest exploitation of the XIVE interrupt controller") Cc: stable@vger.kernel.org # v4.14+ Signed-off-by: Cédric Le Goater Signed-off-by: Michael Ellerman --- arch/powerpc/sysdev/xive/spapr.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/sysdev/xive/spapr.c b/arch/powerpc/sysdev/xive/spapr.c index d9c4c9366049..091f1d0d0af1 100644 --- a/arch/powerpc/sysdev/xive/spapr.c +++ b/arch/powerpc/sysdev/xive/spapr.c @@ -356,7 +356,8 @@ static int xive_spapr_configure_queue(u32 target, struct xive_q *q, u8 prio, rc = plpar_int_get_queue_info(0, target, prio, &esn_page, &esn_size); if (rc) { - pr_err("Error %lld getting queue info prio %d\n", rc, prio); + pr_err("Error %lld getting queue info CPU %d prio %d\n", rc, + target, prio); rc = -EIO; goto fail; } @@ -370,7 +371,8 @@ static int xive_spapr_configure_queue(u32 target, struct xive_q *q, u8 prio, /* Configure and enable the queue in HW */ rc = plpar_int_set_queue_config(flags, target, prio, qpage_phys, order); if (rc) { - pr_err("Error %lld setting queue for prio %d\n", rc, prio); + pr_err("Error %lld setting queue for CPU %d prio %d\n", rc, + target, prio); rc = -EIO; } else { q->qpage = qpage; @@ -389,8 +391,8 @@ static int xive_spapr_setup_queue(unsigned int cpu, struct xive_cpu *xc, if (IS_ERR(qpage)) return PTR_ERR(qpage); - return xive_spapr_configure_queue(cpu, q, prio, qpage, - xive_queue_shift); + return xive_spapr_configure_queue(get_hard_smp_processor_id(cpu), + q, prio, qpage, xive_queue_shift); } static void xive_spapr_cleanup_queue(unsigned int cpu, struct xive_cpu *xc, @@ -399,10 +401,12 @@ static void xive_spapr_cleanup_queue(unsigned int cpu, struct xive_cpu *xc, struct xive_q *q = &xc->queue[prio]; unsigned int alloc_order; long rc; + int hw_cpu = get_hard_smp_processor_id(cpu); - rc = plpar_int_set_queue_config(0, cpu, prio, 0, 0); + rc = plpar_int_set_queue_config(0, hw_cpu, prio, 0, 0); if (rc) - pr_err("Error %ld setting queue for prio %d\n", rc, prio); + pr_err("Error %ld setting queue for CPU %d prio %d\n", rc, + hw_cpu, prio); alloc_order = xive_alloc_order(xive_queue_shift); free_pages((unsigned long)q->qpage, alloc_order); From e7bde88cdb4f0e432398a7d29ca2a15d2c18952a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 13 Feb 2018 17:45:11 +1000 Subject: [PATCH 092/149] powerpc/powernv: IMC fix out of bounds memory access at shutdown The OPAL IMC driver's shutdown handler disables nest PMU counters by walking nodes and taking the first CPU out of their cpumask, which is used to index into the paca (get_hard_smp_processor_id()). This does not always do the right thing, and in particular for CPU-less nodes it returns NR_CPUS and that overruns the paca and dereferences random memory. Fix it by being more careful about checking returned CPU, and only using online CPUs. It's not clear this shutdown code makes sense after commit 885dcd709b ("powerpc/perf: Add nest IMC PMU support"), but this should not make things worse Currently the bug causes us to call OPAL with a junk CPU number. A separate patch in development to change the way pacas are allocated escalates this bug into a crash: Unable to handle kernel paging request for data at address 0x2a21af1eeb000076 Faulting instruction address: 0xc0000000000a5468 Oops: Kernel access of bad area, sig: 11 [#1] ... NIP opal_imc_counters_shutdown+0x148/0x1d0 LR opal_imc_counters_shutdown+0x134/0x1d0 Call Trace: opal_imc_counters_shutdown+0x134/0x1d0 (unreliable) platform_drv_shutdown+0x44/0x60 device_shutdown+0x1f8/0x350 kernel_restart_prepare+0x54/0x70 kernel_restart+0x28/0xc0 SyS_reboot+0x1d0/0x2c0 system_call+0x58/0x6c Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/platforms/powernv/opal-imc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/opal-imc.c b/arch/powerpc/platforms/powernv/opal-imc.c index dd4c9b8b8a81..f6f55ab4980e 100644 --- a/arch/powerpc/platforms/powernv/opal-imc.c +++ b/arch/powerpc/platforms/powernv/opal-imc.c @@ -199,9 +199,11 @@ static void disable_nest_pmu_counters(void) const struct cpumask *l_cpumask; get_online_cpus(); - for_each_online_node(nid) { + for_each_node_with_cpus(nid) { l_cpumask = cpumask_of_node(nid); - cpu = cpumask_first(l_cpumask); + cpu = cpumask_first_and(l_cpumask, cpu_online_mask); + if (cpu >= nr_cpu_ids) + continue; opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, get_hard_smp_processor_id(cpu)); } From c1e150ceb61e4a585bad156da15c33bfe89f5858 Mon Sep 17 00:00:00 2001 From: Corentin Labbe Date: Wed, 14 Feb 2018 12:17:47 +0000 Subject: [PATCH 093/149] powerpc/pseries: Add empty update_numa_cpu_lookup_table() for NUMA=n MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When CONFIG_NUMA is not set, the build fails with: arch/powerpc/platforms/pseries/hotplug-cpu.c:335:4: error: déclaration implicite de la fonction « update_numa_cpu_lookup_table » So we have to add update_numa_cpu_lookup_table() as an empty function when CONFIG_NUMA is not set. Fixes: 1d9a090783be ("powerpc/numa: Invalidate numa_cpu_lookup_table on cpu remove") Signed-off-by: Corentin Labbe Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/topology.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index 593248110902..9f421641a35c 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -81,6 +81,9 @@ static inline int numa_update_cpu_topology(bool cpus_locked) { return 0; } + +static inline void update_numa_cpu_lookup_table(unsigned int cpu, int node) {} + #endif /* CONFIG_NUMA */ #if defined(CONFIG_NUMA) && defined(CONFIG_PPC_SPLPAR) From 6e1d8ea90932f77843730ada0bfea63093b7212a Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 14 Feb 2018 14:55:24 +0300 Subject: [PATCH 094/149] platform/x86: wmi: fix off-by-one write in wmi_dev_probe() wmi_dev_probe() allocates one byte less than necessary, thus subsequent sprintf() call writes trailing zero past the end of the 'buf': BUG: KASAN: slab-out-of-bounds in vsnprintf+0xda4/0x1240 Write of size 1 at addr ffff880423529caf by task kworker/1:1/32 Call Trace: dump_stack+0xb3/0x14d print_address_description+0xd7/0x380 kasan_report+0x166/0x2b0 vsnprintf+0xda4/0x1240 sprintf+0x9b/0xd0 wmi_dev_probe+0x1c3/0x400 driver_probe_device+0x5d1/0x990 bus_for_each_drv+0x109/0x190 __device_attach+0x217/0x360 bus_probe_device+0x1ad/0x260 deferred_probe_work_func+0x10f/0x5d0 process_one_work+0xa8b/0x1dc0 worker_thread+0x20d/0x17d0 kthread+0x311/0x3d0 ret_from_fork+0x3a/0x50 Allocated by task 32: kasan_kmalloc+0xa0/0xd0 __kmalloc+0x14f/0x3e0 wmi_dev_probe+0x182/0x400 driver_probe_device+0x5d1/0x990 bus_for_each_drv+0x109/0x190 __device_attach+0x217/0x360 bus_probe_device+0x1ad/0x260 deferred_probe_work_func+0x10f/0x5d0 process_one_work+0xa8b/0x1dc0 worker_thread+0x20d/0x17d0 kthread+0x311/0x3d0 ret_from_fork+0x3a/0x50 Increment allocation size to fix this. Fixes: 44b6b7661132 ("platform/x86: wmi: create userspace interface for drivers") Signed-off-by: Andrey Ryabinin Cc: Signed-off-by: Andy Shevchenko --- drivers/platform/x86/wmi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/wmi.c b/drivers/platform/x86/wmi.c index daa68acbc900..c0c8945603cb 100644 --- a/drivers/platform/x86/wmi.c +++ b/drivers/platform/x86/wmi.c @@ -933,7 +933,7 @@ static int wmi_dev_probe(struct device *dev) goto probe_failure; } - buf = kmalloc(strlen(wdriver->driver.name) + 4, GFP_KERNEL); + buf = kmalloc(strlen(wdriver->driver.name) + 5, GFP_KERNEL); if (!buf) { ret = -ENOMEM; goto probe_string_failure; From ed5b9ba7bef7f277cbdf315e385b44e0e3b1a9ab Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Sun, 11 Feb 2018 17:18:49 +0800 Subject: [PATCH 095/149] platform/x86: ideapad-laptop: Increase timeout to wait for EC answer Lenovo E41-20 needs more time than 100ms to read VPC, the funtion keys always failed responding. Increase timeout to get the value from VPC, then the funtion keys like mic mute key work well. Signed-off-by: Aaron Ma Signed-off-by: Andy Shevchenko --- drivers/platform/x86/ideapad-laptop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/ideapad-laptop.c b/drivers/platform/x86/ideapad-laptop.c index 5b6f18b18801..535199c9e6bc 100644 --- a/drivers/platform/x86/ideapad-laptop.c +++ b/drivers/platform/x86/ideapad-laptop.c @@ -113,7 +113,7 @@ MODULE_PARM_DESC(no_bt_rfkill, "No rfkill for bluetooth."); /* * ACPI Helpers */ -#define IDEAPAD_EC_TIMEOUT (100) /* in ms */ +#define IDEAPAD_EC_TIMEOUT (200) /* in ms */ static int read_method_int(acpi_handle handle, const char *method, int *val) { From eca39e7f0cdb9bde4003a29149fa695e876c6f73 Mon Sep 17 00:00:00 2001 From: Laszlo Toth Date: Tue, 13 Feb 2018 21:43:43 +0100 Subject: [PATCH 096/149] platform/x86: dell-laptop: fix kbd_get_state's request value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 9862b43624a5 ("platform/x86: dell-laptop: Allocate buffer on heap rather than globally") broke one request, changed it back to the original value. Tested on a Dell E6540, backlight came back. Fixes: 9862b43624a5 ("platform/x86: dell-laptop: Allocate buffer on heap rather than globally") Signed-off-by: Laszlo Toth Reviewed-by: Pali Rohár Reviewed-by: Mario Limonciello Signed-off-by: Andy Shevchenko --- drivers/platform/x86/dell-laptop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/dell-laptop.c b/drivers/platform/x86/dell-laptop.c index 2a68f59d2228..a37cff9fd8d4 100644 --- a/drivers/platform/x86/dell-laptop.c +++ b/drivers/platform/x86/dell-laptop.c @@ -1279,7 +1279,7 @@ static int kbd_get_state(struct kbd_state *state) struct calling_interface_buffer buffer; int ret; - dell_fill_request(&buffer, 0, 0, 0, 0); + dell_fill_request(&buffer, 0x1, 0, 0, 0); ret = dell_send_request(&buffer, CLASS_KBD_BACKLIGHT, SELECT_KBD_BACKLIGHT); if (ret) From c8ba9db2a790c0fcf2f6c4cafd45ff3a0751800e Mon Sep 17 00:00:00 2001 From: Alexander Abrosimov Date: Thu, 8 Feb 2018 01:12:26 +0300 Subject: [PATCH 097/149] platform/x86: dell-laptop: Removed duplicates in DMI whitelist MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixed a mistake in which several entries were duplicated in the DMI list from the below commit fe486138 platform/x86: dell-laptop: Add 2-in-1 devices to the DMI whitelist Signed-off-by: Alexander Abrosimov Reviewed-by: Pali Rohár Signed-off-by: Andy Shevchenko --- drivers/platform/x86/dell-laptop.c | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/drivers/platform/x86/dell-laptop.c b/drivers/platform/x86/dell-laptop.c index a37cff9fd8d4..c52c6723374b 100644 --- a/drivers/platform/x86/dell-laptop.c +++ b/drivers/platform/x86/dell-laptop.c @@ -126,24 +126,6 @@ static const struct dmi_system_id dell_device_table[] __initconst = { DMI_MATCH(DMI_CHASSIS_TYPE, "32"), /*Detachable*/ }, }, - { - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_CHASSIS_TYPE, "30"), /*Tablet*/ - }, - }, - { - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_CHASSIS_TYPE, "31"), /*Convertible*/ - }, - }, - { - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), - DMI_MATCH(DMI_CHASSIS_TYPE, "32"), /*Detachable*/ - }, - }, { .ident = "Dell Computer Corporation", .matches = { From 0b7c1528fb741803396da68a9d8d285ff7db731c Mon Sep 17 00:00:00 2001 From: William Cohen Date: Tue, 30 Jan 2018 22:28:13 -0500 Subject: [PATCH 098/149] perf vendor events aarch64: Add JSON metrics for ARM Cortex-A53 Processor Add JSON metrics for ARM Cortex-A53 Processor. Unlike the Intel processors there isn't a script that automatically generated these files. The patch was manually generated from the documentation and the previous oprofile ARM Cortex ac53 event file patch I made. The relevant documentation is in the "12.9 Events" section of the ARM Cortex A53 MPCore Processor Revision: r0p4 Technical Reference Manual. The ARM Cortex A53 manual is available at: http://infocenter.arm.com/help/topic/com.arm.doc.ddi0500g/DDI0500G_cortex_a53_trm.pdf Use that to look for additional information about the events. Signed-off-by: William Cohen Cc: Alexander Shishkin Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20180131032813.9564-1-wcohen@redhat.com [ Added references provided by William Cohen ] Signed-off-by: Arnaldo Carvalho de Melo --- .../arch/arm64/cortex-a53/branch.json | 27 ++++++++++ .../pmu-events/arch/arm64/cortex-a53/bus.json | 22 ++++++++ .../arch/arm64/cortex-a53/cache.json | 27 ++++++++++ .../arch/arm64/cortex-a53/memory.json | 22 ++++++++ .../arch/arm64/cortex-a53/other.json | 32 ++++++++++++ .../arch/arm64/cortex-a53/pipeline.json | 52 +++++++++++++++++++ tools/perf/pmu-events/arch/arm64/mapfile.csv | 1 + 7 files changed, 183 insertions(+) create mode 100644 tools/perf/pmu-events/arch/arm64/cortex-a53/branch.json create mode 100644 tools/perf/pmu-events/arch/arm64/cortex-a53/bus.json create mode 100644 tools/perf/pmu-events/arch/arm64/cortex-a53/cache.json create mode 100644 tools/perf/pmu-events/arch/arm64/cortex-a53/memory.json create mode 100644 tools/perf/pmu-events/arch/arm64/cortex-a53/other.json create mode 100644 tools/perf/pmu-events/arch/arm64/cortex-a53/pipeline.json diff --git a/tools/perf/pmu-events/arch/arm64/cortex-a53/branch.json b/tools/perf/pmu-events/arch/arm64/cortex-a53/branch.json new file mode 100644 index 000000000000..3b6208763e50 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/cortex-a53/branch.json @@ -0,0 +1,27 @@ +[ + {, + "EventCode": "0x7A", + "EventName": "BR_INDIRECT_SPEC", + "BriefDescription": "Branch speculatively executed - Indirect branch" + }, + {, + "EventCode": "0xC9", + "EventName": "BR_COND", + "BriefDescription": "Conditional branch executed" + }, + {, + "EventCode": "0xCA", + "EventName": "BR_INDIRECT_MISPRED", + "BriefDescription": "Indirect branch mispredicted" + }, + {, + "EventCode": "0xCB", + "EventName": "BR_INDIRECT_MISPRED_ADDR", + "BriefDescription": "Indirect branch mispredicted because of address miscompare" + }, + {, + "EventCode": "0xCC", + "EventName": "BR_COND_MISPRED", + "BriefDescription": "Conditional branch mispredicted" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/cortex-a53/bus.json b/tools/perf/pmu-events/arch/arm64/cortex-a53/bus.json new file mode 100644 index 000000000000..480d9f7460ab --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/cortex-a53/bus.json @@ -0,0 +1,22 @@ +[ + {, + "EventCode": "0x60", + "EventName": "BUS_ACCESS_LD", + "BriefDescription": "Bus access - Read" + }, + {, + "EventCode": "0x61", + "EventName": "BUS_ACCESS_ST", + "BriefDescription": "Bus access - Write" + }, + {, + "EventCode": "0xC0", + "EventName": "EXT_MEM_REQ", + "BriefDescription": "External memory request" + }, + {, + "EventCode": "0xC1", + "EventName": "EXT_MEM_REQ_NC", + "BriefDescription": "Non-cacheable external memory request" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/cortex-a53/cache.json b/tools/perf/pmu-events/arch/arm64/cortex-a53/cache.json new file mode 100644 index 000000000000..11baad6344b9 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/cortex-a53/cache.json @@ -0,0 +1,27 @@ +[ + {, + "EventCode": "0xC2", + "EventName": "PREFETCH_LINEFILL", + "BriefDescription": "Linefill because of prefetch" + }, + {, + "EventCode": "0xC3", + "EventName": "PREFETCH_LINEFILL_DROP", + "BriefDescription": "Instruction Cache Throttle occurred" + }, + {, + "EventCode": "0xC4", + "EventName": "READ_ALLOC_ENTER", + "BriefDescription": "Entering read allocate mode" + }, + {, + "EventCode": "0xC5", + "EventName": "READ_ALLOC", + "BriefDescription": "Read allocate mode" + }, + {, + "EventCode": "0xC8", + "EventName": "EXT_SNOOP", + "BriefDescription": "SCU Snooped data from another CPU for this CPU" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/cortex-a53/memory.json b/tools/perf/pmu-events/arch/arm64/cortex-a53/memory.json new file mode 100644 index 000000000000..480d9f7460ab --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/cortex-a53/memory.json @@ -0,0 +1,22 @@ +[ + {, + "EventCode": "0x60", + "EventName": "BUS_ACCESS_LD", + "BriefDescription": "Bus access - Read" + }, + {, + "EventCode": "0x61", + "EventName": "BUS_ACCESS_ST", + "BriefDescription": "Bus access - Write" + }, + {, + "EventCode": "0xC0", + "EventName": "EXT_MEM_REQ", + "BriefDescription": "External memory request" + }, + {, + "EventCode": "0xC1", + "EventName": "EXT_MEM_REQ_NC", + "BriefDescription": "Non-cacheable external memory request" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/cortex-a53/other.json b/tools/perf/pmu-events/arch/arm64/cortex-a53/other.json new file mode 100644 index 000000000000..73a22402d003 --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/cortex-a53/other.json @@ -0,0 +1,32 @@ +[ + {, + "EventCode": "0x86", + "EventName": "EXC_IRQ", + "BriefDescription": "Exception taken, IRQ" + }, + {, + "EventCode": "0x87", + "EventName": "EXC_FIQ", + "BriefDescription": "Exception taken, FIQ" + }, + {, + "EventCode": "0xC6", + "EventName": "PRE_DECODE_ERR", + "BriefDescription": "Pre-decode error" + }, + {, + "EventCode": "0xD0", + "EventName": "L1I_CACHE_ERR", + "BriefDescription": "L1 Instruction Cache (data or tag) memory error" + }, + {, + "EventCode": "0xD1", + "EventName": "L1D_CACHE_ERR", + "BriefDescription": "L1 Data Cache (data, tag or dirty) memory error, correctable or non-correctable" + }, + {, + "EventCode": "0xD2", + "EventName": "TLB_ERR", + "BriefDescription": "TLB memory error" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/cortex-a53/pipeline.json b/tools/perf/pmu-events/arch/arm64/cortex-a53/pipeline.json new file mode 100644 index 000000000000..3149fb90555a --- /dev/null +++ b/tools/perf/pmu-events/arch/arm64/cortex-a53/pipeline.json @@ -0,0 +1,52 @@ +[ + {, + "EventCode": "0xC7", + "EventName": "STALL_SB_FULL", + "BriefDescription": "Data Write operation that stalls the pipeline because the store buffer is full" + }, + {, + "EventCode": "0xE0", + "EventName": "OTHER_IQ_DEP_STALL", + "BriefDescription": "Cycles that the DPU IQ is empty and that is not because of a recent micro-TLB miss, instruction cache miss or pre-decode error" + }, + {, + "EventCode": "0xE1", + "EventName": "IC_DEP_STALL", + "BriefDescription": "Cycles the DPU IQ is empty and there is an instruction cache miss being processed" + }, + {, + "EventCode": "0xE2", + "EventName": "IUTLB_DEP_STALL", + "BriefDescription": "Cycles the DPU IQ is empty and there is an instruction micro-TLB miss being processed" + }, + {, + "EventCode": "0xE3", + "EventName": "DECODE_DEP_STALL", + "BriefDescription": "Cycles the DPU IQ is empty and there is a pre-decode error being processed" + }, + {, + "EventCode": "0xE4", + "EventName": "OTHER_INTERLOCK_STALL", + "BriefDescription": "Cycles there is an interlock other than Advanced SIMD/Floating-point instructions or load/store instruction" + }, + {, + "EventCode": "0xE5", + "EventName": "AGU_DEP_STALL", + "BriefDescription": "Cycles there is an interlock for a load/store instruction waiting for data to calculate the address in the AGU" + }, + {, + "EventCode": "0xE6", + "EventName": "SIMD_DEP_STALL", + "BriefDescription": "Cycles there is an interlock for an Advanced SIMD/Floating-point operation." + }, + {, + "EventCode": "0xE7", + "EventName": "LD_DEP_STALL", + "BriefDescription": "Cycles there is a stall in the Wr stage because of a load miss" + }, + {, + "EventCode": "0xE8", + "EventName": "ST_DEP_STALL", + "BriefDescription": "Cycles there is a stall in the Wr stage because of a store" + } +] diff --git a/tools/perf/pmu-events/arch/arm64/mapfile.csv b/tools/perf/pmu-events/arch/arm64/mapfile.csv index 219d6756134e..e61c9ca6cf9e 100644 --- a/tools/perf/pmu-events/arch/arm64/mapfile.csv +++ b/tools/perf/pmu-events/arch/arm64/mapfile.csv @@ -13,3 +13,4 @@ # #Family-model,Version,Filename,EventType 0x00000000420f5160,v1,cavium,core +0x00000000410fd03[[:xdigit:]],v1,cortex-a53,core From 6888ff66c44ffa3077ed69e978902d0ff4b84ae1 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 18 Jan 2018 13:26:16 -0800 Subject: [PATCH 099/149] perf evlist: Remove stale mmap read for backward perf_evlist__mmap_read_catchup() and perf_evlist__mmap_read_backward() are only for overwrite mode. But they read the evlist->mmap buffer which is for non-overwrite mode. It did not bring any serious problem yet, because there is no one use it. Remove the unused interfaces. Signed-off-by: Kan Liang Acked-by: Jiri Olsa Acked-by: Wang Nan Cc: Andi Kleen Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1516310792-208685-2-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 17 ----------------- tools/perf/util/evlist.h | 4 ---- 2 files changed, 21 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index ac35cd214feb..e5fc14e53c05 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -715,28 +715,11 @@ union perf_event *perf_evlist__mmap_read_forward(struct perf_evlist *evlist, int return perf_mmap__read_forward(md); } -union perf_event *perf_evlist__mmap_read_backward(struct perf_evlist *evlist, int idx) -{ - struct perf_mmap *md = &evlist->mmap[idx]; - - /* - * No need to check messup for backward ring buffer: - * We can always read arbitrary long data from a backward - * ring buffer unless we forget to pause it before reading. - */ - return perf_mmap__read_backward(md); -} - union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx) { return perf_evlist__mmap_read_forward(evlist, idx); } -void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx) -{ - perf_mmap__read_catchup(&evlist->mmap[idx]); -} - void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx) { perf_mmap__consume(&evlist->mmap[idx], false); diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 75f8e0ad5d76..336b838e6957 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -133,10 +133,6 @@ union perf_event *perf_evlist__mmap_read(struct perf_evlist *evlist, int idx); union perf_event *perf_evlist__mmap_read_forward(struct perf_evlist *evlist, int idx); -union perf_event *perf_evlist__mmap_read_backward(struct perf_evlist *evlist, - int idx); -void perf_evlist__mmap_read_catchup(struct perf_evlist *evlist, int idx); - void perf_evlist__mmap_consume(struct perf_evlist *evlist, int idx); int perf_evlist__open(struct perf_evlist *evlist); From dc6c35c679e96987dc83a003f30bc2cc33c84c00 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 18 Jan 2018 13:26:17 -0800 Subject: [PATCH 100/149] perf mmap: Recalculate size for overwrite mode In perf_mmap__push(), the 'size' need to be recalculated, otherwise the invalid data might be pushed to the record in overwrite mode. The issue is introduced by commit 7fb4b407a124 ("perf mmap: Don't discard prev in backward mode"). When the ring buffer is full in overwrite mode, backward_rb_find_range() will be called to recalculate the 'start' and 'end'. The 'size' needs to be recalculated accordingly. Unconditionally recalculate the 'size', not just for full ring buffer in overwrite mode. Because: - There is no harmful to recalculate the 'size' for other cases. - The code of calculating 'start' and 'end' will be factored out later. The new function does not need to return 'size'. Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Andi Kleen Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Fixes: 7fb4b407a124 ("perf mmap: Don't discard prev in backward mode") Link: http://lkml.kernel.org/r/1516310792-208685-3-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/mmap.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 05076e683938..97cf4fab564b 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -302,6 +302,8 @@ int perf_mmap__push(struct perf_mmap *md, bool overwrite, return -1; } + size = end - start; + if ((start & md->mask) + size != (end & md->mask)) { buf = &data[start & md->mask]; size = md->mask + 1 - (start & md->mask); From f92c8cbe597a5a2ccec702dff824f3fe0f3623eb Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 18 Jan 2018 13:26:18 -0800 Subject: [PATCH 101/149] perf mmap: Cleanup perf_mmap__push() The first assignment for 'start' and 'end' is redundant. Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Andi Kleen Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1516310792-208685-4-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 97cf4fab564b..fbbbe87f0308 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -272,7 +272,7 @@ int perf_mmap__push(struct perf_mmap *md, bool overwrite, { u64 head = perf_mmap__read_head(md); u64 old = md->prev; - u64 end = head, start = old; + u64 end, start; unsigned char *data = md->base + page_size; unsigned long size; void *buf; From 8872481bd04850b19e053dc579de5a11b83b16fc Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 18 Jan 2018 13:26:19 -0800 Subject: [PATCH 102/149] perf mmap: Introduce perf_mmap__read_init() The new function perf_mmap__read_init() is factored out from perf_mmap__push(). It is to calculate the 'start' and 'end' of the available data in ringbuffer. No functional change. Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Andi Kleen Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1516310792-208685-5-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/mmap.c | 37 +++++++++++++++++++++++++++---------- tools/perf/util/mmap.h | 2 ++ 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index fbbbe87f0308..c19a4e640e8e 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -267,24 +267,24 @@ static int overwrite_rb_find_range(void *buf, int mask, u64 head, u64 *start, u6 return -1; } -int perf_mmap__push(struct perf_mmap *md, bool overwrite, - void *to, int push(void *to, void *buf, size_t size)) +/* + * Report the start and end of the available data in ringbuffer + */ +int perf_mmap__read_init(struct perf_mmap *md, bool overwrite, + u64 *startp, u64 *endp) { u64 head = perf_mmap__read_head(md); u64 old = md->prev; - u64 end, start; unsigned char *data = md->base + page_size; unsigned long size; - void *buf; - int rc = 0; - start = overwrite ? head : old; - end = overwrite ? old : head; + *startp = overwrite ? head : old; + *endp = overwrite ? old : head; - if (start == end) + if (*startp == *endp) return 0; - size = end - start; + size = *endp - *startp; if (size > (unsigned long)(md->mask) + 1) { if (!overwrite) { WARN_ONCE(1, "failed to keep up with mmap data. (warn only once)\n"); @@ -298,10 +298,27 @@ int perf_mmap__push(struct perf_mmap *md, bool overwrite, * Backward ring buffer is full. We still have a chance to read * most of data from it. */ - if (overwrite_rb_find_range(data, md->mask, head, &start, &end)) + if (overwrite_rb_find_range(data, md->mask, head, startp, endp)) return -1; } + return 1; +} + +int perf_mmap__push(struct perf_mmap *md, bool overwrite, + void *to, int push(void *to, void *buf, size_t size)) +{ + u64 head = perf_mmap__read_head(md); + u64 end, start; + unsigned char *data = md->base + page_size; + unsigned long size; + void *buf; + int rc = 0; + + rc = perf_mmap__read_init(md, overwrite, &start, &end); + if (rc < 1) + return rc; + size = end - start; if ((start & md->mask) + size != (end & md->mask)) { diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index e43d7b55a55f..9ab2b48df65b 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -94,4 +94,6 @@ int perf_mmap__push(struct perf_mmap *md, bool backward, size_t perf_mmap__mmap_len(struct perf_mmap *map); +int perf_mmap__read_init(struct perf_mmap *md, bool overwrite, + u64 *startp, u64 *endp); #endif /*__PERF_MMAP_H */ From 189f2cc91f9f2efef5d5f4dde43684c01b5f6f2f Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 18 Jan 2018 13:26:20 -0800 Subject: [PATCH 103/149] perf mmap: Add new return value logic for perf_mmap__read_init() Improve the readability by using meaningful enum (-EAGAIN, -EINVAL and 0) to replace the three returning states (0, -1 and 1). Suggested-by: Wang Nan Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Andi Kleen Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1516310792-208685-6-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/mmap.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index c19a4e640e8e..38fa69dc635e 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -282,7 +282,7 @@ int perf_mmap__read_init(struct perf_mmap *md, bool overwrite, *endp = overwrite ? old : head; if (*startp == *endp) - return 0; + return -EAGAIN; size = *endp - *startp; if (size > (unsigned long)(md->mask) + 1) { @@ -291,7 +291,7 @@ int perf_mmap__read_init(struct perf_mmap *md, bool overwrite, md->prev = head; perf_mmap__consume(md, overwrite); - return 0; + return -EAGAIN; } /* @@ -299,10 +299,10 @@ int perf_mmap__read_init(struct perf_mmap *md, bool overwrite, * most of data from it. */ if (overwrite_rb_find_range(data, md->mask, head, startp, endp)) - return -1; + return -EINVAL; } - return 1; + return 0; } int perf_mmap__push(struct perf_mmap *md, bool overwrite, @@ -316,8 +316,8 @@ int perf_mmap__push(struct perf_mmap *md, bool overwrite, int rc = 0; rc = perf_mmap__read_init(md, overwrite, &start, &end); - if (rc < 1) - return rc; + if (rc < 0) + return (rc == -EAGAIN) ? 0 : -1; size = end - start; From b4b036b4c76341a5034e872aca3727c4988a7304 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 18 Jan 2018 13:26:21 -0800 Subject: [PATCH 104/149] perf mmap: Discard 'prev' in perf_mmap__read() The 'start' and 'prev' variables are duplicates in perf_mmap__read(). Use 'map->prev' to replace 'start' in perf_mmap__read_*(). Suggested-by: Wang Nan Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Andi Kleen Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1516310792-208685-7-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/mmap.c | 28 ++++++++++------------------ 1 file changed, 10 insertions(+), 18 deletions(-) diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 38fa69dc635e..125bfda9d037 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -22,29 +22,27 @@ size_t perf_mmap__mmap_len(struct perf_mmap *map) /* When check_messup is true, 'end' must points to a good entry */ static union perf_event *perf_mmap__read(struct perf_mmap *map, - u64 start, u64 end, u64 *prev) + u64 *startp, u64 end) { unsigned char *data = map->base + page_size; union perf_event *event = NULL; - int diff = end - start; + int diff = end - *startp; if (diff >= (int)sizeof(event->header)) { size_t size; - event = (union perf_event *)&data[start & map->mask]; + event = (union perf_event *)&data[*startp & map->mask]; size = event->header.size; - if (size < sizeof(event->header) || diff < (int)size) { - event = NULL; - goto broken_event; - } + if (size < sizeof(event->header) || diff < (int)size) + return NULL; /* * Event straddles the mmap boundary -- header should always * be inside due to u64 alignment of output. */ - if ((start & map->mask) + size != ((start + size) & map->mask)) { - unsigned int offset = start; + if ((*startp & map->mask) + size != ((*startp + size) & map->mask)) { + unsigned int offset = *startp; unsigned int len = min(sizeof(*event), size), cpy; void *dst = map->event_copy; @@ -59,20 +57,15 @@ static union perf_event *perf_mmap__read(struct perf_mmap *map, event = (union perf_event *)map->event_copy; } - start += size; + *startp += size; } -broken_event: - if (prev) - *prev = start; - return event; } union perf_event *perf_mmap__read_forward(struct perf_mmap *map) { u64 head; - u64 old = map->prev; /* * Check if event was unmapped due to a POLLHUP/POLLERR. @@ -82,13 +75,12 @@ union perf_event *perf_mmap__read_forward(struct perf_mmap *map) head = perf_mmap__read_head(map); - return perf_mmap__read(map, old, head, &map->prev); + return perf_mmap__read(map, &map->prev, head); } union perf_event *perf_mmap__read_backward(struct perf_mmap *map) { u64 head, end; - u64 start = map->prev; /* * Check if event was unmapped due to a POLLHUP/POLLERR. @@ -118,7 +110,7 @@ union perf_event *perf_mmap__read_backward(struct perf_mmap *map) else end = head + map->mask + 1; - return perf_mmap__read(map, start, end, &map->prev); + return perf_mmap__read(map, &map->prev, end); } void perf_mmap__read_catchup(struct perf_mmap *map) From ee023de05f35484691f7d9e5c1f92195ac4d64d2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 18 Jan 2018 13:26:22 -0800 Subject: [PATCH 105/149] perf mmap: Introduce perf_mmap__read_done() The direction of overwrite mode is backward. The last perf_mmap__read() will set tail to map->prev. Need to correct the map->prev to head which is the end of next read. It will be used later. Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Andi Kleen Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1516310792-208685-8-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/mmap.c | 11 +++++++++++ tools/perf/util/mmap.h | 1 + 2 files changed, 12 insertions(+) diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 125bfda9d037..4f59eaefc706 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -338,3 +338,14 @@ int perf_mmap__push(struct perf_mmap *md, bool overwrite, out: return rc; } + +/* + * Mandatory for overwrite mode + * The direction of overwrite mode is backward. + * The last perf_mmap__read() will set tail to map->prev. + * Need to correct the map->prev to head which is the end of next read. + */ +void perf_mmap__read_done(struct perf_mmap *map) +{ + map->prev = perf_mmap__read_head(map); +} diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index 9ab2b48df65b..95549d4af943 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -96,4 +96,5 @@ size_t perf_mmap__mmap_len(struct perf_mmap *map); int perf_mmap__read_init(struct perf_mmap *md, bool overwrite, u64 *startp, u64 *endp); +void perf_mmap__read_done(struct perf_mmap *map); #endif /*__PERF_MMAP_H */ From 7bb45972952db9298fe5cc440160dcad1a66bfbc Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 18 Jan 2018 13:26:23 -0800 Subject: [PATCH 106/149] perf mmap: Introduce perf_mmap__read_event() Except for 'perf record', the other perf tools read events one by one from the ring buffer using perf_mmap__read_forward(). But it only supports non-overwrite mode. Introduce perf_mmap__read_event() to support both non-overwrite and overwrite mode. Usage: perf_mmap__read_init() while(event = perf_mmap__read_event()) { //process the event perf_mmap__consume() } perf_mmap__read_done() It cannot use perf_mmap__read_backward(). Because it always reads the stale buffer which is already processed. Furthermore, the forward and backward concepts have been removed. The perf_mmap__read_backward() will be replaced and discarded later. Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Andi Kleen Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1516310792-208685-9-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/mmap.c | 39 +++++++++++++++++++++++++++++++++++++++ tools/perf/util/mmap.h | 4 ++++ 2 files changed, 43 insertions(+) diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index 4f59eaefc706..f804926778b7 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -113,6 +113,45 @@ union perf_event *perf_mmap__read_backward(struct perf_mmap *map) return perf_mmap__read(map, &map->prev, end); } +/* + * Read event from ring buffer one by one. + * Return one event for each call. + * + * Usage: + * perf_mmap__read_init() + * while(event = perf_mmap__read_event()) { + * //process the event + * perf_mmap__consume() + * } + * perf_mmap__read_done() + */ +union perf_event *perf_mmap__read_event(struct perf_mmap *map, + bool overwrite, + u64 *startp, u64 end) +{ + union perf_event *event; + + /* + * Check if event was unmapped due to a POLLHUP/POLLERR. + */ + if (!refcount_read(&map->refcnt)) + return NULL; + + if (startp == NULL) + return NULL; + + /* non-overwirte doesn't pause the ringbuffer */ + if (!overwrite) + end = perf_mmap__read_head(map); + + event = perf_mmap__read(map, startp, end); + + if (!overwrite) + map->prev = *startp; + + return event; +} + void perf_mmap__read_catchup(struct perf_mmap *map) { u64 head; diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index 95549d4af943..28718543dd42 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -89,6 +89,10 @@ static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail) union perf_event *perf_mmap__read_forward(struct perf_mmap *map); union perf_event *perf_mmap__read_backward(struct perf_mmap *map); +union perf_event *perf_mmap__read_event(struct perf_mmap *map, + bool overwrite, + u64 *startp, u64 end); + int perf_mmap__push(struct perf_mmap *md, bool backward, void *to, int push(void *to, void *buf, size_t size)); From 600a7cfe88de2c6e44e23d61dd721b996b790eb2 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 18 Jan 2018 13:26:24 -0800 Subject: [PATCH 107/149] perf test: Update mmap read functions for backward-ring-buffer test Use the new perf_mmap__read_* interfaces for overwrite ringbuffer test. Commiter notes: Testing: [root@seventh ~]# perf test -v backward 48: Read backward ring buffer : --- start --- test child forked, pid 8309 Using CPUID GenuineIntel-6-9E mmap size 1052672B mmap size 8192B Finished reading overwrite ring buffer: rewind test child finished with 0 ---- end ---- Read backward ring buffer: Ok [root@seventh ~]# Signed-off-by: Kan Liang Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1516310792-208685-10-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/backward-ring-buffer.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/perf/tests/backward-ring-buffer.c b/tools/perf/tests/backward-ring-buffer.c index 4035d43523c3..e0b1b414d466 100644 --- a/tools/perf/tests/backward-ring-buffer.c +++ b/tools/perf/tests/backward-ring-buffer.c @@ -31,10 +31,12 @@ static int count_samples(struct perf_evlist *evlist, int *sample_count, int i; for (i = 0; i < evlist->nr_mmaps; i++) { + struct perf_mmap *map = &evlist->overwrite_mmap[i]; union perf_event *event; + u64 start, end; - perf_mmap__read_catchup(&evlist->overwrite_mmap[i]); - while ((event = perf_mmap__read_backward(&evlist->overwrite_mmap[i])) != NULL) { + perf_mmap__read_init(map, true, &start, &end); + while ((event = perf_mmap__read_event(map, true, &start, end)) != NULL) { const u32 type = event->header.type; switch (type) { @@ -49,6 +51,7 @@ static int count_samples(struct perf_evlist *evlist, int *sample_count, return TEST_FAIL; } } + perf_mmap__read_done(map); } return TEST_OK; } From 3effc2f165a842d640873e29d4c5cc1650143aef Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 18 Jan 2018 13:26:25 -0800 Subject: [PATCH 108/149] perf mmap: Discard legacy interface for mmap read Discards perf_mmap__read_backward() and perf_mmap__read_catchup(). No tools use them. There are tools still use perf_mmap__read_forward(). Keep it, but add comments to point to the new interface for future use. Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Andi Kleen Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1516310792-208685-11-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/mmap.c | 50 ++++-------------------------------------- tools/perf/util/mmap.h | 3 --- 2 files changed, 4 insertions(+), 49 deletions(-) diff --git a/tools/perf/util/mmap.c b/tools/perf/util/mmap.c index f804926778b7..91531a7c8fbf 100644 --- a/tools/perf/util/mmap.c +++ b/tools/perf/util/mmap.c @@ -63,6 +63,10 @@ static union perf_event *perf_mmap__read(struct perf_mmap *map, return event; } +/* + * legacy interface for mmap read. + * Don't use it. Use perf_mmap__read_event(). + */ union perf_event *perf_mmap__read_forward(struct perf_mmap *map) { u64 head; @@ -78,41 +82,6 @@ union perf_event *perf_mmap__read_forward(struct perf_mmap *map) return perf_mmap__read(map, &map->prev, head); } -union perf_event *perf_mmap__read_backward(struct perf_mmap *map) -{ - u64 head, end; - - /* - * Check if event was unmapped due to a POLLHUP/POLLERR. - */ - if (!refcount_read(&map->refcnt)) - return NULL; - - head = perf_mmap__read_head(map); - if (!head) - return NULL; - - /* - * 'head' pointer starts from 0. Kernel minus sizeof(record) form - * it each time when kernel writes to it, so in fact 'head' is - * negative. 'end' pointer is made manually by adding the size of - * the ring buffer to 'head' pointer, means the validate data can - * read is the whole ring buffer. If 'end' is positive, the ring - * buffer has not fully filled, so we must adjust 'end' to 0. - * - * However, since both 'head' and 'end' is unsigned, we can't - * simply compare 'end' against 0. Here we compare '-head' and - * the size of the ring buffer, where -head is the number of bytes - * kernel write to the ring buffer. - */ - if (-head < (u64)(map->mask + 1)) - end = 0; - else - end = head + map->mask + 1; - - return perf_mmap__read(map, &map->prev, end); -} - /* * Read event from ring buffer one by one. * Return one event for each call. @@ -152,17 +121,6 @@ union perf_event *perf_mmap__read_event(struct perf_mmap *map, return event; } -void perf_mmap__read_catchup(struct perf_mmap *map) -{ - u64 head; - - if (!refcount_read(&map->refcnt)) - return; - - head = perf_mmap__read_head(map); - map->prev = head; -} - static bool perf_mmap__empty(struct perf_mmap *map) { return perf_mmap__read_head(map) == map->prev && !map->auxtrace_mmap.base; diff --git a/tools/perf/util/mmap.h b/tools/perf/util/mmap.h index 28718543dd42..ec7d3a24e276 100644 --- a/tools/perf/util/mmap.h +++ b/tools/perf/util/mmap.h @@ -65,8 +65,6 @@ void perf_mmap__put(struct perf_mmap *map); void perf_mmap__consume(struct perf_mmap *map, bool overwrite); -void perf_mmap__read_catchup(struct perf_mmap *md); - static inline u64 perf_mmap__read_head(struct perf_mmap *mm) { struct perf_event_mmap_page *pc = mm->base; @@ -87,7 +85,6 @@ static inline void perf_mmap__write_tail(struct perf_mmap *md, u64 tail) } union perf_event *perf_mmap__read_forward(struct perf_mmap *map); -union perf_event *perf_mmap__read_backward(struct perf_mmap *map); union perf_event *perf_mmap__read_event(struct perf_mmap *map, bool overwrite, From 63878a53cedc3df31bd4ba8740a49fa0fc116ac6 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 18 Jan 2018 13:26:26 -0800 Subject: [PATCH 109/149] perf top: Check per-event overwrite term Per-event overwrite term is not forbidden in 'perf top', which can bring problems. Because 'perf top' only support non-overwrite mode now. Add new rules and check regarding to overwrite term for 'perf top'. - All events either have same per-event term or don't have per-event mode setting. Otherwise, it will error out. - Per-event overwrite term should be consistent as opts->overwrite. If not, updating the opts->overwrite according to per-event term. Make it possible to support either non-overwrite or overwrite mode. The overwrite mode is forbidden now, which will be removed when the overwrite mode is supported later. Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Andi Kleen Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1516310792-208685-12-git-send-email-kan.liang@intel.com [ Renamed perf_top_overwrite_check to perf_top__overwrite_check, to follow existing convention ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 73 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 73 insertions(+) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index c6ccda52117d..17783798924a 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -881,6 +881,68 @@ static void perf_top__mmap_read(struct perf_top *top) perf_top__mmap_read_idx(top, i); } +/* + * Check per-event overwrite term. + * perf top should support consistent term for all events. + * - All events don't have per-event term + * E.g. "cpu/cpu-cycles/,cpu/instructions/" + * Nothing change, return 0. + * - All events have same per-event term + * E.g. "cpu/cpu-cycles,no-overwrite/,cpu/instructions,no-overwrite/ + * Using the per-event setting to replace the opts->overwrite if + * they are different, then return 0. + * - Events have different per-event term + * E.g. "cpu/cpu-cycles,overwrite/,cpu/instructions,no-overwrite/" + * Return -1 + * - Some of the event set per-event term, but some not. + * E.g. "cpu/cpu-cycles/,cpu/instructions,no-overwrite/" + * Return -1 + */ +static int perf_top__overwrite_check(struct perf_top *top) +{ + struct record_opts *opts = &top->record_opts; + struct perf_evlist *evlist = top->evlist; + struct perf_evsel_config_term *term; + struct list_head *config_terms; + struct perf_evsel *evsel; + int set, overwrite = -1; + + evlist__for_each_entry(evlist, evsel) { + set = -1; + config_terms = &evsel->config_terms; + list_for_each_entry(term, config_terms, list) { + if (term->type == PERF_EVSEL__CONFIG_TERM_OVERWRITE) + set = term->val.overwrite ? 1 : 0; + } + + /* no term for current and previous event (likely) */ + if ((overwrite < 0) && (set < 0)) + continue; + + /* has term for both current and previous event, compare */ + if ((overwrite >= 0) && (set >= 0) && (overwrite != set)) + return -1; + + /* no term for current event but has term for previous one */ + if ((overwrite >= 0) && (set < 0)) + return -1; + + /* has term for current event */ + if ((overwrite < 0) && (set >= 0)) { + /* if it's first event, set overwrite */ + if (evsel == perf_evlist__first(evlist)) + overwrite = set; + else + return -1; + } + } + + if ((overwrite >= 0) && (opts->overwrite != overwrite)) + opts->overwrite = overwrite; + + return 0; +} + static int perf_top__start_counters(struct perf_top *top) { char msg[BUFSIZ]; @@ -888,6 +950,17 @@ static int perf_top__start_counters(struct perf_top *top) struct perf_evlist *evlist = top->evlist; struct record_opts *opts = &top->record_opts; + if (perf_top__overwrite_check(top)) { + ui__error("perf top only support consistent per-event " + "overwrite setting for all events\n"); + goto out_err; + } + + if (opts->overwrite) { + ui__error("not support overwrite mode yet\n"); + goto out_err; + } + perf_evlist__config(evlist, opts, &callchain_param); evlist__for_each_entry(evlist, counter) { From 9a831b3a32c5daf5d7cc672334d51930f78e4ea3 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 2 Feb 2018 11:27:25 -0300 Subject: [PATCH 110/149] perf evsel: Expose the perf_missing_features struct As tools may need to adjust to missing features, as 'perf top' will, in the next csets, to cope with a missing 'write_backward' feature. Cc: Andi Kleen Cc: Jin Yao Cc: Jiri Olsa Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: https://lkml.kernel.org/n/tip-jelngl9q1ooaizvkcput9tic@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evsel.c | 12 +----------- tools/perf/util/evsel.h | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index ff359c9ece2e..ef351688b797 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -41,17 +41,7 @@ #include "sane_ctype.h" -static struct { - bool sample_id_all; - bool exclude_guest; - bool mmap2; - bool cloexec; - bool clockid; - bool clockid_wrong; - bool lbr_flags; - bool write_backward; - bool group_read; -} perf_missing_features; +struct perf_missing_features perf_missing_features; static clockid_t clockid; diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 846e41644525..a7487c6d1866 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -149,6 +149,20 @@ union u64_swap { u32 val32[2]; }; +struct perf_missing_features { + bool sample_id_all; + bool exclude_guest; + bool mmap2; + bool cloexec; + bool clockid; + bool clockid_wrong; + bool lbr_flags; + bool write_backward; + bool group_read; +}; + +extern struct perf_missing_features perf_missing_features; + struct cpu_map; struct target; struct thread_map; From 204721d7eabe6ee98aafce791ce3efdbc4715834 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 18 Jan 2018 13:26:28 -0800 Subject: [PATCH 111/149] perf top: Add overwrite fall back Switch to non-overwrite mode if kernel doesnot support overwrite ringbuffer. It's only effect when overwrite mode is supported. No change to current behavior. Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Andi Kleen Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1516310792-208685-14-git-send-email-kan.liang@intel.com [ Use perf_missing_features.write_backward instead of the non merged is_write_backward_fail() ] Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 17783798924a..ee4bba1e282c 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -943,6 +943,27 @@ static int perf_top__overwrite_check(struct perf_top *top) return 0; } +static int perf_top_overwrite_fallback(struct perf_top *top, + struct perf_evsel *evsel) +{ + struct record_opts *opts = &top->record_opts; + struct perf_evlist *evlist = top->evlist; + struct perf_evsel *counter; + + if (!opts->overwrite) + return 0; + + /* only fall back when first event fails */ + if (evsel != perf_evlist__first(evlist)) + return 0; + + evlist__for_each_entry(evlist, counter) + counter->attr.write_backward = false; + opts->overwrite = false; + ui__warning("fall back to non-overwrite mode\n"); + return 1; +} + static int perf_top__start_counters(struct perf_top *top) { char msg[BUFSIZ]; @@ -967,6 +988,21 @@ static int perf_top__start_counters(struct perf_top *top) try_again: if (perf_evsel__open(counter, top->evlist->cpus, top->evlist->threads) < 0) { + + /* + * Specially handle overwrite fall back. + * Because perf top is the only tool which has + * overwrite mode by default, support + * both overwrite and non-overwrite mode, and + * require consistent mode for all events. + * + * May move it to generic code with more tools + * have similar attribute. + */ + if (perf_missing_features.write_backward && + perf_top_overwrite_fallback(top, counter)) + goto try_again; + if (perf_evsel__fallback(counter, errno, msg, sizeof(msg))) { if (verbose > 0) ui__warning("%s\n", msg); From 06cc1a470ab237b991901729b125404c164f3660 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 18 Jan 2018 13:26:29 -0800 Subject: [PATCH 112/149] perf hists browser: Add parameter to disable lost event warning For overwrite mode, the ringbuffer will be paused. The event lost is expected. It needs a way to notify the browser not print the warning. It will be used later for perf top to disable lost event warning in overwrite mode. There is no behavior change for now. Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Andi Kleen Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1516310792-208685-15-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-c2c.c | 4 ++-- tools/perf/builtin-report.c | 3 ++- tools/perf/builtin-top.c | 2 +- tools/perf/ui/browsers/hists.c | 38 ++++++++++++++++++++++------------ tools/perf/ui/browsers/hists.h | 3 ++- tools/perf/util/hist.h | 6 ++++-- 6 files changed, 36 insertions(+), 20 deletions(-) diff --git a/tools/perf/builtin-c2c.c b/tools/perf/builtin-c2c.c index c0815a37fdb5..539c3d460158 100644 --- a/tools/perf/builtin-c2c.c +++ b/tools/perf/builtin-c2c.c @@ -2245,7 +2245,7 @@ static int perf_c2c__browse_cacheline(struct hist_entry *he) c2c_browser__update_nr_entries(browser); while (1) { - key = hist_browser__run(browser, "? - help"); + key = hist_browser__run(browser, "? - help", true); switch (key) { case 's': @@ -2314,7 +2314,7 @@ static int perf_c2c__hists_browse(struct hists *hists) c2c_browser__update_nr_entries(browser); while (1) { - key = hist_browser__run(browser, "? - help"); + key = hist_browser__run(browser, "? - help", true); switch (key) { case 'q': diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 42a52dcc41cd..4ad5dc649716 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -530,7 +530,8 @@ static int report__browse_hists(struct report *rep) case 1: ret = perf_evlist__tui_browse_hists(evlist, help, NULL, rep->min_percent, - &session->header.env); + &session->header.env, + true); /* * Usually "ret" is the last pressed key, and we only * care if the key notifies us to switch data file. diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index ee4bba1e282c..7def861a9ec4 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -611,7 +611,7 @@ static void *display_thread_tui(void *arg) perf_evlist__tui_browse_hists(top->evlist, help, &hbt, top->min_percent, - &top->session->header.env); + &top->session->header.env, true); done = 1; return NULL; diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index 68146f4620a5..6495ee55d9c3 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -608,7 +608,8 @@ static int hist_browser__title(struct hist_browser *browser, char *bf, size_t si return browser->title ? browser->title(browser, bf, size) : 0; } -int hist_browser__run(struct hist_browser *browser, const char *help) +int hist_browser__run(struct hist_browser *browser, const char *help, + bool warn_lost_event) { int key; char title[160]; @@ -638,8 +639,9 @@ int hist_browser__run(struct hist_browser *browser, const char *help) nr_entries = hist_browser__nr_entries(browser); ui_browser__update_nr_entries(&browser->b, nr_entries); - if (browser->hists->stats.nr_lost_warned != - browser->hists->stats.nr_events[PERF_RECORD_LOST]) { + if (warn_lost_event && + (browser->hists->stats.nr_lost_warned != + browser->hists->stats.nr_events[PERF_RECORD_LOST])) { browser->hists->stats.nr_lost_warned = browser->hists->stats.nr_events[PERF_RECORD_LOST]; ui_browser__warn_lost_events(&browser->b); @@ -2763,7 +2765,8 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, bool left_exits, struct hist_browser_timer *hbt, float min_pcnt, - struct perf_env *env) + struct perf_env *env, + bool warn_lost_event) { struct hists *hists = evsel__hists(evsel); struct hist_browser *browser = perf_evsel_browser__new(evsel, hbt, env); @@ -2844,7 +2847,8 @@ static int perf_evsel__hists_browse(struct perf_evsel *evsel, int nr_events, nr_options = 0; - key = hist_browser__run(browser, helpline); + key = hist_browser__run(browser, helpline, + warn_lost_event); if (browser->he_selection != NULL) { thread = hist_browser__selected_thread(browser); @@ -3184,7 +3188,8 @@ static void perf_evsel_menu__write(struct ui_browser *browser, static int perf_evsel_menu__run(struct perf_evsel_menu *menu, int nr_events, const char *help, - struct hist_browser_timer *hbt) + struct hist_browser_timer *hbt, + bool warn_lost_event) { struct perf_evlist *evlist = menu->b.priv; struct perf_evsel *pos; @@ -3203,7 +3208,9 @@ static int perf_evsel_menu__run(struct perf_evsel_menu *menu, case K_TIMER: hbt->timer(hbt->arg); - if (!menu->lost_events_warned && menu->lost_events) { + if (!menu->lost_events_warned && + menu->lost_events && + warn_lost_event) { ui_browser__warn_lost_events(&menu->b); menu->lost_events_warned = true; } @@ -3224,7 +3231,8 @@ browse_hists: key = perf_evsel__hists_browse(pos, nr_events, help, true, hbt, menu->min_pcnt, - menu->env); + menu->env, + warn_lost_event); ui_browser__show_title(&menu->b, title); switch (key) { case K_TAB: @@ -3282,7 +3290,8 @@ static int __perf_evlist__tui_browse_hists(struct perf_evlist *evlist, int nr_entries, const char *help, struct hist_browser_timer *hbt, float min_pcnt, - struct perf_env *env) + struct perf_env *env, + bool warn_lost_event) { struct perf_evsel *pos; struct perf_evsel_menu menu = { @@ -3309,13 +3318,15 @@ static int __perf_evlist__tui_browse_hists(struct perf_evlist *evlist, menu.b.width = line_len; } - return perf_evsel_menu__run(&menu, nr_entries, help, hbt); + return perf_evsel_menu__run(&menu, nr_entries, help, + hbt, warn_lost_event); } int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help, struct hist_browser_timer *hbt, float min_pcnt, - struct perf_env *env) + struct perf_env *env, + bool warn_lost_event) { int nr_entries = evlist->nr_entries; @@ -3325,7 +3336,7 @@ single_entry: return perf_evsel__hists_browse(first, nr_entries, help, false, hbt, min_pcnt, - env); + env, warn_lost_event); } if (symbol_conf.event_group) { @@ -3342,5 +3353,6 @@ single_entry: } return __perf_evlist__tui_browse_hists(evlist, nr_entries, help, - hbt, min_pcnt, env); + hbt, min_pcnt, env, + warn_lost_event); } diff --git a/tools/perf/ui/browsers/hists.h b/tools/perf/ui/browsers/hists.h index ba431777f559..9428bee076f2 100644 --- a/tools/perf/ui/browsers/hists.h +++ b/tools/perf/ui/browsers/hists.h @@ -28,7 +28,8 @@ struct hist_browser { struct hist_browser *hist_browser__new(struct hists *hists); void hist_browser__delete(struct hist_browser *browser); -int hist_browser__run(struct hist_browser *browser, const char *help); +int hist_browser__run(struct hist_browser *browser, const char *help, + bool warn_lost_event); void hist_browser__init(struct hist_browser *browser, struct hists *hists); #endif /* _PERF_UI_BROWSER_HISTS_H_ */ diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index f6630cb95eff..02721b579746 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -430,7 +430,8 @@ int hist_entry__tui_annotate(struct hist_entry *he, struct perf_evsel *evsel, int perf_evlist__tui_browse_hists(struct perf_evlist *evlist, const char *help, struct hist_browser_timer *hbt, float min_pcnt, - struct perf_env *env); + struct perf_env *env, + bool warn_lost_event); int script_browse(const char *script_opt); #else static inline @@ -438,7 +439,8 @@ int perf_evlist__tui_browse_hists(struct perf_evlist *evlist __maybe_unused, const char *help __maybe_unused, struct hist_browser_timer *hbt __maybe_unused, float min_pcnt __maybe_unused, - struct perf_env *env __maybe_unused) + struct perf_env *env __maybe_unused, + bool warn_lost_event __maybe_unused) { return 0; } From a1ff5b05e988ca3620027148cd61013408ea4194 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 18 Jan 2018 13:26:30 -0800 Subject: [PATCH 113/149] perf top: Remove lost events checking There would be some records lost in overwrite mode because of pausing the ringbuffer. It has little impact for the accuracy of the snapshot and could be tolerated by 'perf top'. Remove the lost events checking. Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Andi Kleen Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1516310792-208685-16-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 7def861a9ec4..59653062bb48 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -283,8 +283,9 @@ static void perf_top__print_sym_table(struct perf_top *top) printf("%-*.*s\n", win_width, win_width, graph_dotted_line); - if (hists->stats.nr_lost_warned != - hists->stats.nr_events[PERF_RECORD_LOST]) { + if (!top->record_opts.overwrite && + (hists->stats.nr_lost_warned != + hists->stats.nr_events[PERF_RECORD_LOST])) { hists->stats.nr_lost_warned = hists->stats.nr_events[PERF_RECORD_LOST]; color_fprintf(stdout, PERF_COLOR_RED, @@ -611,7 +612,8 @@ static void *display_thread_tui(void *arg) perf_evlist__tui_browse_hists(top->evlist, help, &hbt, top->min_percent, - &top->session->header.env, true); + &top->session->header.env, + !top->record_opts.overwrite); done = 1; return NULL; From ebebbf082357f86cc84a4d46ce897a5750e41b7a Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 18 Jan 2018 13:26:31 -0800 Subject: [PATCH 114/149] perf top: Switch default mode to overwrite mode perf_top__mmap_read() has a severe performance issue in the Knights Landing/Mill platform, when monitoring heavy load systems. It costs several minutes to finish, which is unacceptable. Currently, 'perf top' uses the non overwrite mode. For non overwrite mode, it tries to read everything in the ringbuffer and doesn't pause it. Once there are lots of samples delivered persistently, the processing time could be very long. Also, the latest samples could be lost when the ringbuffer is full. For overwrite mode, it takes a snapshot for the system by pausing the ringbuffer, which could significantly reduce the processing time. Also, the overwrite mode always keep the latest samples. Considering the real time requirement for 'perf top', the overwrite mode is more suitable for it. Actually, 'perf top' was overwrite mode. It is changed to non overwrite mode since commit 93fc64f14472 ("perf top: Switch to non overwrite mode"). It's better to change it back to overwrite mode by default. For the kernel which doesn't support overwrite mode, it will fall back to non overwrite mode. There would be some records lost in overwrite mode because of pausing the ringbuffer. It has little impact for the accuracy of the snapshot and can be tolerated. For overwrite mode, unconditionally wait 100 ms before each snapshot. It also reduces the overhead caused by pausing ringbuffer, especially on light load system. Signed-off-by: Kan Liang Acked-by: Jiri Olsa Tested-by: Arnaldo Carvalho de Melo Cc: Andi Kleen Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1516310792-208685-17-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 59653062bb48..2b4914f34ed6 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -809,15 +809,23 @@ static void perf_event__process_sample(struct perf_tool *tool, static void perf_top__mmap_read_idx(struct perf_top *top, int idx) { + struct record_opts *opts = &top->record_opts; + struct perf_evlist *evlist = top->evlist; struct perf_sample sample; struct perf_evsel *evsel; + struct perf_mmap *md; struct perf_session *session = top->session; union perf_event *event; struct machine *machine; + u64 end, start; int ret; - while ((event = perf_evlist__mmap_read(top->evlist, idx)) != NULL) { - ret = perf_evlist__parse_sample(top->evlist, event, &sample); + md = opts->overwrite ? &evlist->overwrite_mmap[idx] : &evlist->mmap[idx]; + if (perf_mmap__read_init(md, opts->overwrite, &start, &end) < 0) + return; + + while ((event = perf_mmap__read_event(md, opts->overwrite, &start, end)) != NULL) { + ret = perf_evlist__parse_sample(evlist, event, &sample); if (ret) { pr_err("Can't parse sample, err = %d\n", ret); goto next_event; @@ -871,16 +879,28 @@ static void perf_top__mmap_read_idx(struct perf_top *top, int idx) } else ++session->evlist->stats.nr_unknown_events; next_event: - perf_evlist__mmap_consume(top->evlist, idx); + perf_mmap__consume(md, opts->overwrite); } + + perf_mmap__read_done(md); } static void perf_top__mmap_read(struct perf_top *top) { + bool overwrite = top->record_opts.overwrite; + struct perf_evlist *evlist = top->evlist; int i; + if (overwrite) + perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_DATA_PENDING); + for (i = 0; i < top->evlist->nr_mmaps; i++) perf_top__mmap_read_idx(top, i); + + if (overwrite) { + perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY); + perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_RUNNING); + } } /* @@ -979,11 +999,6 @@ static int perf_top__start_counters(struct perf_top *top) goto out_err; } - if (opts->overwrite) { - ui__error("not support overwrite mode yet\n"); - goto out_err; - } - perf_evlist__config(evlist, opts, &callchain_param); evlist__for_each_entry(evlist, counter) { @@ -1144,7 +1159,7 @@ static int __cmd_top(struct perf_top *top) perf_top__mmap_read(top); - if (hits == top->samples) + if (opts->overwrite || (hits == top->samples)) ret = perf_evlist__poll(top->evlist, 100); if (resize) { @@ -1238,6 +1253,7 @@ int cmd_top(int argc, const char **argv) .uses_mmap = true, }, .proc_map_timeout = 500, + .overwrite = 1, }, .max_stack = sysctl_perf_event_max_stack, .sym_pcnt_filter = 5, From 8cc42de736b617827a4e7664fb8d7a325bc125bc Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 18 Jan 2018 13:26:32 -0800 Subject: [PATCH 115/149] perf top: Check the latency of perf_top__mmap_read() The latency of perf_top__mmap_read() should be lower than refresh time. If not, give some hints to reduce the latency. Signed-off-by: Kan Liang Acked-by: Jiri Olsa Cc: Andi Kleen Cc: Jin Yao Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Wang Nan Link: http://lkml.kernel.org/r/1516310792-208685-18-git-send-email-kan.liang@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-top.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 2b4914f34ed6..b7c823ba8374 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -889,8 +889,10 @@ static void perf_top__mmap_read(struct perf_top *top) { bool overwrite = top->record_opts.overwrite; struct perf_evlist *evlist = top->evlist; + unsigned long long start, end; int i; + start = rdclock(); if (overwrite) perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_DATA_PENDING); @@ -901,6 +903,13 @@ static void perf_top__mmap_read(struct perf_top *top) perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_EMPTY); perf_evlist__toggle_bkw_mmap(evlist, BKW_MMAP_RUNNING); } + end = rdclock(); + + if ((end - start) > (unsigned long long)top->delay_secs * NSEC_PER_SEC) + ui__warning("Too slow to read ring buffer.\n" + "Please try increasing the period (-c) or\n" + "decreasing the freq (-F) or\n" + "limiting the number of CPUs (-C)\n"); } /* From 6677d26c8befa462eab9be6c5335a939011e7e65 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 29 Jan 2018 15:03:59 +0200 Subject: [PATCH 116/149] perf tools: Substitute yet another strtoull() Instead of home grown function let's use what library provides us. Signed-off-by: Andriy Shevchenko Acked-by: Jiri Olsa Link: http://lkml.kernel.org/r/20180129130359.1490-1-andriy.shevchenko@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/util.c | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/tools/perf/util/util.c b/tools/perf/util/util.c index 443892dabedb..1019bbc5dbd8 100644 --- a/tools/perf/util/util.c +++ b/tools/perf/util/util.c @@ -340,35 +340,15 @@ size_t hex_width(u64 v) return n; } -static int hex(char ch) -{ - if ((ch >= '0') && (ch <= '9')) - return ch - '0'; - if ((ch >= 'a') && (ch <= 'f')) - return ch - 'a' + 10; - if ((ch >= 'A') && (ch <= 'F')) - return ch - 'A' + 10; - return -1; -} - /* * While we find nice hex chars, build a long_val. * Return number of chars processed. */ int hex2u64(const char *ptr, u64 *long_val) { - const char *p = ptr; - *long_val = 0; + char *p; - while (*p) { - const int hex_val = hex(*p); - - if (hex_val < 0) - break; - - *long_val = (*long_val << 4) | hex_val; - p++; - } + *long_val = strtoull(ptr, &p, 16); return p - ptr; } From ba7e851642f48002def3450b279598c187721fd0 Mon Sep 17 00:00:00 2001 From: Sangwon Hong Date: Mon, 5 Feb 2018 20:48:35 +0900 Subject: [PATCH 117/149] perf data: Document missing --force option Add the --force option to the man page. Signed-off-by: Sangwon Hong Cc: Jiri Olsa Cc: Namhyung Kim Cc: Taeung Song Link: http://lkml.kernel.org/r/1517831315-31490-1-git-send-email-qpakzk@gmail.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Documentation/perf-data.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/perf/Documentation/perf-data.txt b/tools/perf/Documentation/perf-data.txt index f0796a47dfa3..90bb4aabe4f8 100644 --- a/tools/perf/Documentation/perf-data.txt +++ b/tools/perf/Documentation/perf-data.txt @@ -30,6 +30,10 @@ OPTIONS for 'convert' -i:: Specify input perf data file path. +-f:: +--force:: + Don't complain, do it. + -v:: --verbose:: Be more verbose (show counter open errors, etc). From 7a92453620d42c3a5fea94a864dc6aa04c262b93 Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Wed, 17 Jan 2018 09:38:31 +0100 Subject: [PATCH 118/149] perf test: Fix test trace+probe_libc_inet_pton.sh for s390x On Intel test case trace+probe_libc_inet_pton.sh succeeds and the output is: [root@f27 perf]# ./perf trace --no-syscalls -e probe_libc:inet_pton/max-stack=3/ ping -6 -c 1 ::1 PING ::1(::1) 56 data bytes 64 bytes from ::1: icmp_seq=1 ttl=64 time=0.037 ms --- ::1 ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 0.037/0.037/0.037/0.000 ms 0.000 probe_libc:inet_pton:(7fa40ac618a0)) __GI___inet_pton (/usr/lib64/libc-2.26.so) getaddrinfo (/usr/lib64/libc-2.26.so) main (/usr/bin/ping) The kernel stack unwinder is used, it is specified implicitly as call-graph=fp (frame pointer). On s390x only dwarf is available for stack unwinding. It is also done in user space. This requires different parameter setup and result checking for s390x and Intel. This patch adds separate perf trace setup and result checking for Intel and s390x. On s390x specify this command line to get a call-graph and handle the different call graph result checking: [root@s35lp76 perf]# ./perf trace --no-syscalls -e probe_libc:inet_pton/call-graph=dwarf/ ping -6 -c 1 ::1 PING ::1(::1) 56 data bytes 64 bytes from ::1: icmp_seq=1 ttl=64 time=0.041 ms --- ::1 ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 0.041/0.041/0.041/0.000 ms 0.000 probe_libc:inet_pton:(3ffb9942060)) __GI___inet_pton (/usr/lib64/libc-2.26.so) gaih_inet (inlined) __GI_getaddrinfo (inlined) main (/usr/bin/ping) __libc_start_main (/usr/lib64/libc-2.26.so) _start (/usr/bin/ping) [root@s35lp76 perf]# Before: [root@s8360047 perf]# ./perf test -vv 58 58: probe libc's inet_pton & backtrace it with ping : --- start --- test child forked, pid 26349 PING ::1(::1) 56 data bytes 64 bytes from ::1: icmp_seq=1 ttl=64 time=0.079 ms --- ::1 ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 0.079/0.079/0.079/0.000 ms 0.000 probe_libc:inet_pton:(3ff925c2060)) test child finished with -1 ---- end ---- probe libc's inet_pton & backtrace it with ping: FAILED! [root@s8360047 perf]# After: [root@s35lp76 perf]# ./perf test -vv 57 57: probe libc's inet_pton & backtrace it with ping : --- start --- test child forked, pid 38708 PING ::1(::1) 56 data bytes 64 bytes from ::1: icmp_seq=1 ttl=64 time=0.038 ms --- ::1 ping statistics --- 1 packets transmitted, 1 received, 0% packet loss, time 0ms rtt min/avg/max/mdev = 0.038/0.038/0.038/0.000 ms 0.000 probe_libc:inet_pton:(3ff87342060)) __GI___inet_pton (/usr/lib64/libc-2.26.so) gaih_inet (inlined) __GI_getaddrinfo (inlined) main (/usr/bin/ping) __libc_start_main (/usr/lib64/libc-2.26.so) _start (/usr/bin/ping) test child finished with 0 ---- end ---- probe libc's inet_pton & backtrace it with ping: Ok [root@s35lp76 perf]# On Intel the test case runs unchanged and succeeds. Signed-off-by: Thomas Richter Reviewed-by: Hendrik Brueckner Tested-by: Arnaldo Carvalho de Melo Cc: Heiko Carstens Cc: Martin Schwidefsky Link: http://lkml.kernel.org/r/20180117083831.101001-1-tmricht@linux.vnet.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- .../tests/shell/trace+probe_libc_inet_pton.sh | 21 +++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/tools/perf/tests/shell/trace+probe_libc_inet_pton.sh b/tools/perf/tests/shell/trace+probe_libc_inet_pton.sh index 8b3da21a08f1..c446c894b297 100755 --- a/tools/perf/tests/shell/trace+probe_libc_inet_pton.sh +++ b/tools/perf/tests/shell/trace+probe_libc_inet_pton.sh @@ -22,10 +22,23 @@ trace_libc_inet_pton_backtrace() { expected[4]="rtt min.*" expected[5]="[0-9]+\.[0-9]+[[:space:]]+probe_libc:inet_pton:\([[:xdigit:]]+\)" expected[6]=".*inet_pton[[:space:]]\($libc\)$" - expected[7]="getaddrinfo[[:space:]]\($libc\)$" - expected[8]=".*\(.*/bin/ping.*\)$" + case "$(uname -m)" in + s390x) + eventattr='call-graph=dwarf' + expected[7]="gaih_inet[[:space:]]\(inlined\)$" + expected[8]="__GI_getaddrinfo[[:space:]]\(inlined\)$" + expected[9]="main[[:space:]]\(.*/bin/ping.*\)$" + expected[10]="__libc_start_main[[:space:]]\($libc\)$" + expected[11]="_start[[:space:]]\(.*/bin/ping.*\)$" + ;; + *) + eventattr='max-stack=3' + expected[7]="getaddrinfo[[:space:]]\($libc\)$" + expected[8]=".*\(.*/bin/ping.*\)$" + ;; + esac - perf trace --no-syscalls -e probe_libc:inet_pton/max-stack=3/ ping -6 -c 1 ::1 2>&1 | grep -v ^$ | while read line ; do + perf trace --no-syscalls -e probe_libc:inet_pton/$eventattr/ ping -6 -c 1 ::1 2>&1 | grep -v ^$ | while read line ; do echo $line echo "$line" | egrep -q "${expected[$idx]}" if [ $? -ne 0 ] ; then @@ -33,7 +46,7 @@ trace_libc_inet_pton_backtrace() { exit 1 fi let idx+=1 - [ $idx -eq 9 ] && break + [ -z "${expected[$idx]}" ] && break done } From f091f1d6a2b4840c9b631d6138f5354401347863 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 13 Feb 2018 12:54:58 +0100 Subject: [PATCH 119/149] tools/headers: Synchronize kernel ABI headers, v4.16-rc1 Sync the following tooling headers with the latest kernel version: tools/arch/powerpc/include/uapi/asm/kvm.h tools/arch/x86/include/asm/cpufeatures.h tools/include/uapi/drm/i915_drm.h tools/include/uapi/linux/if_link.h tools/include/uapi/linux/kvm.h All the changes are new ABI additions which don't impact their use in existing tooling. Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Namhyung Kim Cc: Jiri Olsa Cc: Stephen Rothwell Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- tools/arch/powerpc/include/uapi/asm/kvm.h | 2 + tools/arch/x86/include/asm/cpufeatures.h | 1 + tools/include/uapi/drm/i915_drm.h | 77 +++++++++++++++++++ tools/include/uapi/linux/if_link.h | 1 + tools/include/uapi/linux/kvm.h | 90 +++++++++++++++++++++++ 5 files changed, 171 insertions(+) diff --git a/tools/arch/powerpc/include/uapi/asm/kvm.h b/tools/arch/powerpc/include/uapi/asm/kvm.h index 637b7263cb86..833ed9a16adf 100644 --- a/tools/arch/powerpc/include/uapi/asm/kvm.h +++ b/tools/arch/powerpc/include/uapi/asm/kvm.h @@ -632,6 +632,8 @@ struct kvm_ppc_cpu_char { #define KVM_REG_PPC_TIDR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbc) #define KVM_REG_PPC_PSSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbd) +#define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe) + /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs */ diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 1d9199e1c2ad..0dfe4d3f74e2 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -210,6 +210,7 @@ #define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ #define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* "" Fill RSB on context switches */ +#define X86_FEATURE_SEV ( 7*32+20) /* AMD Secure Encrypted Virtualization */ #define X86_FEATURE_USE_IBPB ( 7*32+21) /* "" Indirect Branch Prediction Barrier enabled */ diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index ac3c6503ca27..536ee4febd74 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -86,6 +86,62 @@ enum i915_mocs_table_index { I915_MOCS_CACHED, }; +/* + * Different engines serve different roles, and there may be more than one + * engine serving each role. enum drm_i915_gem_engine_class provides a + * classification of the role of the engine, which may be used when requesting + * operations to be performed on a certain subset of engines, or for providing + * information about that group. + */ +enum drm_i915_gem_engine_class { + I915_ENGINE_CLASS_RENDER = 0, + I915_ENGINE_CLASS_COPY = 1, + I915_ENGINE_CLASS_VIDEO = 2, + I915_ENGINE_CLASS_VIDEO_ENHANCE = 3, + + I915_ENGINE_CLASS_INVALID = -1 +}; + +/** + * DOC: perf_events exposed by i915 through /sys/bus/event_sources/drivers/i915 + * + */ + +enum drm_i915_pmu_engine_sample { + I915_SAMPLE_BUSY = 0, + I915_SAMPLE_WAIT = 1, + I915_SAMPLE_SEMA = 2 +}; + +#define I915_PMU_SAMPLE_BITS (4) +#define I915_PMU_SAMPLE_MASK (0xf) +#define I915_PMU_SAMPLE_INSTANCE_BITS (8) +#define I915_PMU_CLASS_SHIFT \ + (I915_PMU_SAMPLE_BITS + I915_PMU_SAMPLE_INSTANCE_BITS) + +#define __I915_PMU_ENGINE(class, instance, sample) \ + ((class) << I915_PMU_CLASS_SHIFT | \ + (instance) << I915_PMU_SAMPLE_BITS | \ + (sample)) + +#define I915_PMU_ENGINE_BUSY(class, instance) \ + __I915_PMU_ENGINE(class, instance, I915_SAMPLE_BUSY) + +#define I915_PMU_ENGINE_WAIT(class, instance) \ + __I915_PMU_ENGINE(class, instance, I915_SAMPLE_WAIT) + +#define I915_PMU_ENGINE_SEMA(class, instance) \ + __I915_PMU_ENGINE(class, instance, I915_SAMPLE_SEMA) + +#define __I915_PMU_OTHER(x) (__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x)) + +#define I915_PMU_ACTUAL_FREQUENCY __I915_PMU_OTHER(0) +#define I915_PMU_REQUESTED_FREQUENCY __I915_PMU_OTHER(1) +#define I915_PMU_INTERRUPTS __I915_PMU_OTHER(2) +#define I915_PMU_RC6_RESIDENCY __I915_PMU_OTHER(3) + +#define I915_PMU_LAST I915_PMU_RC6_RESIDENCY + /* Each region is a minimum of 16k, and there are at most 255 of them. */ #define I915_NR_TEX_REGIONS 255 /* table size 2k - maximum due to use @@ -450,6 +506,27 @@ typedef struct drm_i915_irq_wait { */ #define I915_PARAM_HAS_EXEC_FENCE_ARRAY 49 +/* + * Query whether every context (both per-file default and user created) is + * isolated (insofar as HW supports). If this parameter is not true, then + * freshly created contexts may inherit values from an existing context, + * rather than default HW values. If true, it also ensures (insofar as HW + * supports) that all state set by this context will not leak to any other + * context. + * + * As not every engine across every gen support contexts, the returned + * value reports the support of context isolation for individual engines by + * returning a bitmask of each engine class set to true if that class supports + * isolation. + */ +#define I915_PARAM_HAS_CONTEXT_ISOLATION 50 + +/* Frequency of the command streamer timestamps given by the *_TIMESTAMP + * registers. This used to be fixed per platform but from CNL onwards, this + * might vary depending on the parts. + */ +#define I915_PARAM_CS_TIMESTAMP_FREQUENCY 51 + typedef struct drm_i915_getparam { __s32 param; /* diff --git a/tools/include/uapi/linux/if_link.h b/tools/include/uapi/linux/if_link.h index 8616131e2c61..6d9447700e18 100644 --- a/tools/include/uapi/linux/if_link.h +++ b/tools/include/uapi/linux/if_link.h @@ -163,6 +163,7 @@ enum { IFLA_IF_NETNSID, IFLA_CARRIER_UP_COUNT, IFLA_CARRIER_DOWN_COUNT, + IFLA_NEW_IFINDEX, __IFLA_MAX }; diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 8fb90a0819c3..0fb5ef939732 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1362,6 +1362,96 @@ struct kvm_s390_ucas_mapping { /* Available with KVM_CAP_S390_CMMA_MIGRATION */ #define KVM_S390_GET_CMMA_BITS _IOWR(KVMIO, 0xb8, struct kvm_s390_cmma_log) #define KVM_S390_SET_CMMA_BITS _IOW(KVMIO, 0xb9, struct kvm_s390_cmma_log) +/* Memory Encryption Commands */ +#define KVM_MEMORY_ENCRYPT_OP _IOWR(KVMIO, 0xba, unsigned long) + +struct kvm_enc_region { + __u64 addr; + __u64 size; +}; + +#define KVM_MEMORY_ENCRYPT_REG_REGION _IOR(KVMIO, 0xbb, struct kvm_enc_region) +#define KVM_MEMORY_ENCRYPT_UNREG_REGION _IOR(KVMIO, 0xbc, struct kvm_enc_region) + +/* Secure Encrypted Virtualization command */ +enum sev_cmd_id { + /* Guest initialization commands */ + KVM_SEV_INIT = 0, + KVM_SEV_ES_INIT, + /* Guest launch commands */ + KVM_SEV_LAUNCH_START, + KVM_SEV_LAUNCH_UPDATE_DATA, + KVM_SEV_LAUNCH_UPDATE_VMSA, + KVM_SEV_LAUNCH_SECRET, + KVM_SEV_LAUNCH_MEASURE, + KVM_SEV_LAUNCH_FINISH, + /* Guest migration commands (outgoing) */ + KVM_SEV_SEND_START, + KVM_SEV_SEND_UPDATE_DATA, + KVM_SEV_SEND_UPDATE_VMSA, + KVM_SEV_SEND_FINISH, + /* Guest migration commands (incoming) */ + KVM_SEV_RECEIVE_START, + KVM_SEV_RECEIVE_UPDATE_DATA, + KVM_SEV_RECEIVE_UPDATE_VMSA, + KVM_SEV_RECEIVE_FINISH, + /* Guest status and debug commands */ + KVM_SEV_GUEST_STATUS, + KVM_SEV_DBG_DECRYPT, + KVM_SEV_DBG_ENCRYPT, + /* Guest certificates commands */ + KVM_SEV_CERT_EXPORT, + + KVM_SEV_NR_MAX, +}; + +struct kvm_sev_cmd { + __u32 id; + __u64 data; + __u32 error; + __u32 sev_fd; +}; + +struct kvm_sev_launch_start { + __u32 handle; + __u32 policy; + __u64 dh_uaddr; + __u32 dh_len; + __u64 session_uaddr; + __u32 session_len; +}; + +struct kvm_sev_launch_update_data { + __u64 uaddr; + __u32 len; +}; + + +struct kvm_sev_launch_secret { + __u64 hdr_uaddr; + __u32 hdr_len; + __u64 guest_uaddr; + __u32 guest_len; + __u64 trans_uaddr; + __u32 trans_len; +}; + +struct kvm_sev_launch_measure { + __u64 uaddr; + __u32 len; +}; + +struct kvm_sev_guest_status { + __u32 handle; + __u32 policy; + __u32 state; +}; + +struct kvm_sev_dbg { + __u64 src_uaddr; + __u64 dst_uaddr; + __u32 len; +}; #define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) #define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) From baa676103037e0dd145bb905eb51bc0b2f48fd49 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Thu, 8 Feb 2018 12:47:49 +0100 Subject: [PATCH 120/149] perf s390: Grab a copy of arch/s390/kernel/syscall/syscall.tbl Grab a copy of the s390 system call table file introduced with commit 857f46bfb07f53dc112d69bdfb137cc5ec3da7c5 "s390/syscalls: add system call table". Signed-off-by: Hendrik Brueckner Cc: Jiri Olsa Cc: Michael Petlan Cc: Thomas Richter Cc: linux-s390@vger.kernel.org LPU-Reference: 1518090470-2899-3-git-send-email-brueckner@linux.vnet.ibm.com Link: https://lkml.kernel.org/n/tip-hpw7vdjp7g92ivgpddrp5ydq@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- .../perf/arch/s390/entry/syscalls/syscall.tbl | 390 ++++++++++++++++++ 1 file changed, 390 insertions(+) create mode 100644 tools/perf/arch/s390/entry/syscalls/syscall.tbl diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl new file mode 100644 index 000000000000..b38d48464368 --- /dev/null +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -0,0 +1,390 @@ +# SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note +# +# System call table for s390 +# +# Format: +# +# +# +# where can be common, 64, or 32 + +1 common exit sys_exit sys_exit +2 common fork sys_fork sys_fork +3 common read sys_read compat_sys_s390_read +4 common write sys_write compat_sys_s390_write +5 common open sys_open compat_sys_open +6 common close sys_close sys_close +7 common restart_syscall sys_restart_syscall sys_restart_syscall +8 common creat sys_creat compat_sys_creat +9 common link sys_link compat_sys_link +10 common unlink sys_unlink compat_sys_unlink +11 common execve sys_execve compat_sys_execve +12 common chdir sys_chdir compat_sys_chdir +13 32 time - compat_sys_time +14 common mknod sys_mknod compat_sys_mknod +15 common chmod sys_chmod compat_sys_chmod +16 32 lchown - compat_sys_s390_lchown16 +19 common lseek sys_lseek compat_sys_lseek +20 common getpid sys_getpid sys_getpid +21 common mount sys_mount compat_sys_mount +22 common umount sys_oldumount compat_sys_oldumount +23 32 setuid - compat_sys_s390_setuid16 +24 32 getuid - compat_sys_s390_getuid16 +25 32 stime - compat_sys_stime +26 common ptrace sys_ptrace compat_sys_ptrace +27 common alarm sys_alarm sys_alarm +29 common pause sys_pause sys_pause +30 common utime sys_utime compat_sys_utime +33 common access sys_access compat_sys_access +34 common nice sys_nice sys_nice +36 common sync sys_sync sys_sync +37 common kill sys_kill sys_kill +38 common rename sys_rename compat_sys_rename +39 common mkdir sys_mkdir compat_sys_mkdir +40 common rmdir sys_rmdir compat_sys_rmdir +41 common dup sys_dup sys_dup +42 common pipe sys_pipe compat_sys_pipe +43 common times sys_times compat_sys_times +45 common brk sys_brk compat_sys_brk +46 32 setgid - compat_sys_s390_setgid16 +47 32 getgid - compat_sys_s390_getgid16 +48 common signal sys_signal compat_sys_signal +49 32 geteuid - compat_sys_s390_geteuid16 +50 32 getegid - compat_sys_s390_getegid16 +51 common acct sys_acct compat_sys_acct +52 common umount2 sys_umount compat_sys_umount +54 common ioctl sys_ioctl compat_sys_ioctl +55 common fcntl sys_fcntl compat_sys_fcntl +57 common setpgid sys_setpgid sys_setpgid +60 common umask sys_umask sys_umask +61 common chroot sys_chroot compat_sys_chroot +62 common ustat sys_ustat compat_sys_ustat +63 common dup2 sys_dup2 sys_dup2 +64 common getppid sys_getppid sys_getppid +65 common getpgrp sys_getpgrp sys_getpgrp +66 common setsid sys_setsid sys_setsid +67 common sigaction sys_sigaction compat_sys_sigaction +70 32 setreuid - compat_sys_s390_setreuid16 +71 32 setregid - compat_sys_s390_setregid16 +72 common sigsuspend sys_sigsuspend compat_sys_sigsuspend +73 common sigpending sys_sigpending compat_sys_sigpending +74 common sethostname sys_sethostname compat_sys_sethostname +75 common setrlimit sys_setrlimit compat_sys_setrlimit +76 32 getrlimit - compat_sys_old_getrlimit +77 common getrusage sys_getrusage compat_sys_getrusage +78 common gettimeofday sys_gettimeofday compat_sys_gettimeofday +79 common settimeofday sys_settimeofday compat_sys_settimeofday +80 32 getgroups - compat_sys_s390_getgroups16 +81 32 setgroups - compat_sys_s390_setgroups16 +83 common symlink sys_symlink compat_sys_symlink +85 common readlink sys_readlink compat_sys_readlink +86 common uselib sys_uselib compat_sys_uselib +87 common swapon sys_swapon compat_sys_swapon +88 common reboot sys_reboot compat_sys_reboot +89 common readdir - compat_sys_old_readdir +90 common mmap sys_old_mmap compat_sys_s390_old_mmap +91 common munmap sys_munmap compat_sys_munmap +92 common truncate sys_truncate compat_sys_truncate +93 common ftruncate sys_ftruncate compat_sys_ftruncate +94 common fchmod sys_fchmod sys_fchmod +95 32 fchown - compat_sys_s390_fchown16 +96 common getpriority sys_getpriority sys_getpriority +97 common setpriority sys_setpriority sys_setpriority +99 common statfs sys_statfs compat_sys_statfs +100 common fstatfs sys_fstatfs compat_sys_fstatfs +101 32 ioperm - - +102 common socketcall sys_socketcall compat_sys_socketcall +103 common syslog sys_syslog compat_sys_syslog +104 common setitimer sys_setitimer compat_sys_setitimer +105 common getitimer sys_getitimer compat_sys_getitimer +106 common stat sys_newstat compat_sys_newstat +107 common lstat sys_newlstat compat_sys_newlstat +108 common fstat sys_newfstat compat_sys_newfstat +110 common lookup_dcookie sys_lookup_dcookie compat_sys_lookup_dcookie +111 common vhangup sys_vhangup sys_vhangup +112 common idle - - +114 common wait4 sys_wait4 compat_sys_wait4 +115 common swapoff sys_swapoff compat_sys_swapoff +116 common sysinfo sys_sysinfo compat_sys_sysinfo +117 common ipc sys_s390_ipc compat_sys_s390_ipc +118 common fsync sys_fsync sys_fsync +119 common sigreturn sys_sigreturn compat_sys_sigreturn +120 common clone sys_clone compat_sys_clone +121 common setdomainname sys_setdomainname compat_sys_setdomainname +122 common uname sys_newuname compat_sys_newuname +124 common adjtimex sys_adjtimex compat_sys_adjtimex +125 common mprotect sys_mprotect compat_sys_mprotect +126 common sigprocmask sys_sigprocmask compat_sys_sigprocmask +127 common create_module - - +128 common init_module sys_init_module compat_sys_init_module +129 common delete_module sys_delete_module compat_sys_delete_module +130 common get_kernel_syms - - +131 common quotactl sys_quotactl compat_sys_quotactl +132 common getpgid sys_getpgid sys_getpgid +133 common fchdir sys_fchdir sys_fchdir +134 common bdflush sys_bdflush compat_sys_bdflush +135 common sysfs sys_sysfs compat_sys_sysfs +136 common personality sys_s390_personality sys_s390_personality +137 common afs_syscall - - +138 32 setfsuid - compat_sys_s390_setfsuid16 +139 32 setfsgid - compat_sys_s390_setfsgid16 +140 32 _llseek - compat_sys_llseek +141 common getdents sys_getdents compat_sys_getdents +142 32 _newselect - compat_sys_select +142 64 select sys_select - +143 common flock sys_flock sys_flock +144 common msync sys_msync compat_sys_msync +145 common readv sys_readv compat_sys_readv +146 common writev sys_writev compat_sys_writev +147 common getsid sys_getsid sys_getsid +148 common fdatasync sys_fdatasync sys_fdatasync +149 common _sysctl sys_sysctl compat_sys_sysctl +150 common mlock sys_mlock compat_sys_mlock +151 common munlock sys_munlock compat_sys_munlock +152 common mlockall sys_mlockall sys_mlockall +153 common munlockall sys_munlockall sys_munlockall +154 common sched_setparam sys_sched_setparam compat_sys_sched_setparam +155 common sched_getparam sys_sched_getparam compat_sys_sched_getparam +156 common sched_setscheduler sys_sched_setscheduler compat_sys_sched_setscheduler +157 common sched_getscheduler sys_sched_getscheduler sys_sched_getscheduler +158 common sched_yield sys_sched_yield sys_sched_yield +159 common sched_get_priority_max sys_sched_get_priority_max sys_sched_get_priority_max +160 common sched_get_priority_min sys_sched_get_priority_min sys_sched_get_priority_min +161 common sched_rr_get_interval sys_sched_rr_get_interval compat_sys_sched_rr_get_interval +162 common nanosleep sys_nanosleep compat_sys_nanosleep +163 common mremap sys_mremap compat_sys_mremap +164 32 setresuid - compat_sys_s390_setresuid16 +165 32 getresuid - compat_sys_s390_getresuid16 +167 common query_module - - +168 common poll sys_poll compat_sys_poll +169 common nfsservctl - - +170 32 setresgid - compat_sys_s390_setresgid16 +171 32 getresgid - compat_sys_s390_getresgid16 +172 common prctl sys_prctl compat_sys_prctl +173 common rt_sigreturn sys_rt_sigreturn compat_sys_rt_sigreturn +174 common rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction +175 common rt_sigprocmask sys_rt_sigprocmask compat_sys_rt_sigprocmask +176 common rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending +177 common rt_sigtimedwait sys_rt_sigtimedwait compat_sys_rt_sigtimedwait +178 common rt_sigqueueinfo sys_rt_sigqueueinfo compat_sys_rt_sigqueueinfo +179 common rt_sigsuspend sys_rt_sigsuspend compat_sys_rt_sigsuspend +180 common pread64 sys_pread64 compat_sys_s390_pread64 +181 common pwrite64 sys_pwrite64 compat_sys_s390_pwrite64 +182 32 chown - compat_sys_s390_chown16 +183 common getcwd sys_getcwd compat_sys_getcwd +184 common capget sys_capget compat_sys_capget +185 common capset sys_capset compat_sys_capset +186 common sigaltstack sys_sigaltstack compat_sys_sigaltstack +187 common sendfile sys_sendfile64 compat_sys_sendfile +188 common getpmsg - - +189 common putpmsg - - +190 common vfork sys_vfork sys_vfork +191 32 ugetrlimit - compat_sys_getrlimit +191 64 getrlimit sys_getrlimit - +192 32 mmap2 - compat_sys_s390_mmap2 +193 32 truncate64 - compat_sys_s390_truncate64 +194 32 ftruncate64 - compat_sys_s390_ftruncate64 +195 32 stat64 - compat_sys_s390_stat64 +196 32 lstat64 - compat_sys_s390_lstat64 +197 32 fstat64 - compat_sys_s390_fstat64 +198 32 lchown32 - compat_sys_lchown +198 64 lchown sys_lchown - +199 32 getuid32 - sys_getuid +199 64 getuid sys_getuid - +200 32 getgid32 - sys_getgid +200 64 getgid sys_getgid - +201 32 geteuid32 - sys_geteuid +201 64 geteuid sys_geteuid - +202 32 getegid32 - sys_getegid +202 64 getegid sys_getegid - +203 32 setreuid32 - sys_setreuid +203 64 setreuid sys_setreuid - +204 32 setregid32 - sys_setregid +204 64 setregid sys_setregid - +205 32 getgroups32 - compat_sys_getgroups +205 64 getgroups sys_getgroups - +206 32 setgroups32 - compat_sys_setgroups +206 64 setgroups sys_setgroups - +207 32 fchown32 - sys_fchown +207 64 fchown sys_fchown - +208 32 setresuid32 - sys_setresuid +208 64 setresuid sys_setresuid - +209 32 getresuid32 - compat_sys_getresuid +209 64 getresuid sys_getresuid - +210 32 setresgid32 - sys_setresgid +210 64 setresgid sys_setresgid - +211 32 getresgid32 - compat_sys_getresgid +211 64 getresgid sys_getresgid - +212 32 chown32 - compat_sys_chown +212 64 chown sys_chown - +213 32 setuid32 - sys_setuid +213 64 setuid sys_setuid - +214 32 setgid32 - sys_setgid +214 64 setgid sys_setgid - +215 32 setfsuid32 - sys_setfsuid +215 64 setfsuid sys_setfsuid - +216 32 setfsgid32 - sys_setfsgid +216 64 setfsgid sys_setfsgid - +217 common pivot_root sys_pivot_root compat_sys_pivot_root +218 common mincore sys_mincore compat_sys_mincore +219 common madvise sys_madvise compat_sys_madvise +220 common getdents64 sys_getdents64 compat_sys_getdents64 +221 32 fcntl64 - compat_sys_fcntl64 +222 common readahead sys_readahead compat_sys_s390_readahead +223 32 sendfile64 - compat_sys_sendfile64 +224 common setxattr sys_setxattr compat_sys_setxattr +225 common lsetxattr sys_lsetxattr compat_sys_lsetxattr +226 common fsetxattr sys_fsetxattr compat_sys_fsetxattr +227 common getxattr sys_getxattr compat_sys_getxattr +228 common lgetxattr sys_lgetxattr compat_sys_lgetxattr +229 common fgetxattr sys_fgetxattr compat_sys_fgetxattr +230 common listxattr sys_listxattr compat_sys_listxattr +231 common llistxattr sys_llistxattr compat_sys_llistxattr +232 common flistxattr sys_flistxattr compat_sys_flistxattr +233 common removexattr sys_removexattr compat_sys_removexattr +234 common lremovexattr sys_lremovexattr compat_sys_lremovexattr +235 common fremovexattr sys_fremovexattr compat_sys_fremovexattr +236 common gettid sys_gettid sys_gettid +237 common tkill sys_tkill sys_tkill +238 common futex sys_futex compat_sys_futex +239 common sched_setaffinity sys_sched_setaffinity compat_sys_sched_setaffinity +240 common sched_getaffinity sys_sched_getaffinity compat_sys_sched_getaffinity +241 common tgkill sys_tgkill sys_tgkill +243 common io_setup sys_io_setup compat_sys_io_setup +244 common io_destroy sys_io_destroy compat_sys_io_destroy +245 common io_getevents sys_io_getevents compat_sys_io_getevents +246 common io_submit sys_io_submit compat_sys_io_submit +247 common io_cancel sys_io_cancel compat_sys_io_cancel +248 common exit_group sys_exit_group sys_exit_group +249 common epoll_create sys_epoll_create sys_epoll_create +250 common epoll_ctl sys_epoll_ctl compat_sys_epoll_ctl +251 common epoll_wait sys_epoll_wait compat_sys_epoll_wait +252 common set_tid_address sys_set_tid_address compat_sys_set_tid_address +253 common fadvise64 sys_fadvise64_64 compat_sys_s390_fadvise64 +254 common timer_create sys_timer_create compat_sys_timer_create +255 common timer_settime sys_timer_settime compat_sys_timer_settime +256 common timer_gettime sys_timer_gettime compat_sys_timer_gettime +257 common timer_getoverrun sys_timer_getoverrun sys_timer_getoverrun +258 common timer_delete sys_timer_delete sys_timer_delete +259 common clock_settime sys_clock_settime compat_sys_clock_settime +260 common clock_gettime sys_clock_gettime compat_sys_clock_gettime +261 common clock_getres sys_clock_getres compat_sys_clock_getres +262 common clock_nanosleep sys_clock_nanosleep compat_sys_clock_nanosleep +264 32 fadvise64_64 - compat_sys_s390_fadvise64_64 +265 common statfs64 sys_statfs64 compat_sys_statfs64 +266 common fstatfs64 sys_fstatfs64 compat_sys_fstatfs64 +267 common remap_file_pages sys_remap_file_pages compat_sys_remap_file_pages +268 common mbind sys_mbind compat_sys_mbind +269 common get_mempolicy sys_get_mempolicy compat_sys_get_mempolicy +270 common set_mempolicy sys_set_mempolicy compat_sys_set_mempolicy +271 common mq_open sys_mq_open compat_sys_mq_open +272 common mq_unlink sys_mq_unlink compat_sys_mq_unlink +273 common mq_timedsend sys_mq_timedsend compat_sys_mq_timedsend +274 common mq_timedreceive sys_mq_timedreceive compat_sys_mq_timedreceive +275 common mq_notify sys_mq_notify compat_sys_mq_notify +276 common mq_getsetattr sys_mq_getsetattr compat_sys_mq_getsetattr +277 common kexec_load sys_kexec_load compat_sys_kexec_load +278 common add_key sys_add_key compat_sys_add_key +279 common request_key sys_request_key compat_sys_request_key +280 common keyctl sys_keyctl compat_sys_keyctl +281 common waitid sys_waitid compat_sys_waitid +282 common ioprio_set sys_ioprio_set sys_ioprio_set +283 common ioprio_get sys_ioprio_get sys_ioprio_get +284 common inotify_init sys_inotify_init sys_inotify_init +285 common inotify_add_watch sys_inotify_add_watch compat_sys_inotify_add_watch +286 common inotify_rm_watch sys_inotify_rm_watch sys_inotify_rm_watch +287 common migrate_pages sys_migrate_pages compat_sys_migrate_pages +288 common openat sys_openat compat_sys_openat +289 common mkdirat sys_mkdirat compat_sys_mkdirat +290 common mknodat sys_mknodat compat_sys_mknodat +291 common fchownat sys_fchownat compat_sys_fchownat +292 common futimesat sys_futimesat compat_sys_futimesat +293 32 fstatat64 - compat_sys_s390_fstatat64 +293 64 newfstatat sys_newfstatat - +294 common unlinkat sys_unlinkat compat_sys_unlinkat +295 common renameat sys_renameat compat_sys_renameat +296 common linkat sys_linkat compat_sys_linkat +297 common symlinkat sys_symlinkat compat_sys_symlinkat +298 common readlinkat sys_readlinkat compat_sys_readlinkat +299 common fchmodat sys_fchmodat compat_sys_fchmodat +300 common faccessat sys_faccessat compat_sys_faccessat +301 common pselect6 sys_pselect6 compat_sys_pselect6 +302 common ppoll sys_ppoll compat_sys_ppoll +303 common unshare sys_unshare compat_sys_unshare +304 common set_robust_list sys_set_robust_list compat_sys_set_robust_list +305 common get_robust_list sys_get_robust_list compat_sys_get_robust_list +306 common splice sys_splice compat_sys_splice +307 common sync_file_range sys_sync_file_range compat_sys_s390_sync_file_range +308 common tee sys_tee compat_sys_tee +309 common vmsplice sys_vmsplice compat_sys_vmsplice +310 common move_pages sys_move_pages compat_sys_move_pages +311 common getcpu sys_getcpu compat_sys_getcpu +312 common epoll_pwait sys_epoll_pwait compat_sys_epoll_pwait +313 common utimes sys_utimes compat_sys_utimes +314 common fallocate sys_fallocate compat_sys_s390_fallocate +315 common utimensat sys_utimensat compat_sys_utimensat +316 common signalfd sys_signalfd compat_sys_signalfd +317 common timerfd - - +318 common eventfd sys_eventfd sys_eventfd +319 common timerfd_create sys_timerfd_create sys_timerfd_create +320 common timerfd_settime sys_timerfd_settime compat_sys_timerfd_settime +321 common timerfd_gettime sys_timerfd_gettime compat_sys_timerfd_gettime +322 common signalfd4 sys_signalfd4 compat_sys_signalfd4 +323 common eventfd2 sys_eventfd2 sys_eventfd2 +324 common inotify_init1 sys_inotify_init1 sys_inotify_init1 +325 common pipe2 sys_pipe2 compat_sys_pipe2 +326 common dup3 sys_dup3 sys_dup3 +327 common epoll_create1 sys_epoll_create1 sys_epoll_create1 +328 common preadv sys_preadv compat_sys_preadv +329 common pwritev sys_pwritev compat_sys_pwritev +330 common rt_tgsigqueueinfo sys_rt_tgsigqueueinfo compat_sys_rt_tgsigqueueinfo +331 common perf_event_open sys_perf_event_open compat_sys_perf_event_open +332 common fanotify_init sys_fanotify_init sys_fanotify_init +333 common fanotify_mark sys_fanotify_mark compat_sys_fanotify_mark +334 common prlimit64 sys_prlimit64 compat_sys_prlimit64 +335 common name_to_handle_at sys_name_to_handle_at compat_sys_name_to_handle_at +336 common open_by_handle_at sys_open_by_handle_at compat_sys_open_by_handle_at +337 common clock_adjtime sys_clock_adjtime compat_sys_clock_adjtime +338 common syncfs sys_syncfs sys_syncfs +339 common setns sys_setns sys_setns +340 common process_vm_readv sys_process_vm_readv compat_sys_process_vm_readv +341 common process_vm_writev sys_process_vm_writev compat_sys_process_vm_writev +342 common s390_runtime_instr sys_s390_runtime_instr sys_s390_runtime_instr +343 common kcmp sys_kcmp compat_sys_kcmp +344 common finit_module sys_finit_module compat_sys_finit_module +345 common sched_setattr sys_sched_setattr compat_sys_sched_setattr +346 common sched_getattr sys_sched_getattr compat_sys_sched_getattr +347 common renameat2 sys_renameat2 compat_sys_renameat2 +348 common seccomp sys_seccomp compat_sys_seccomp +349 common getrandom sys_getrandom compat_sys_getrandom +350 common memfd_create sys_memfd_create compat_sys_memfd_create +351 common bpf sys_bpf compat_sys_bpf +352 common s390_pci_mmio_write sys_s390_pci_mmio_write compat_sys_s390_pci_mmio_write +353 common s390_pci_mmio_read sys_s390_pci_mmio_read compat_sys_s390_pci_mmio_read +354 common execveat sys_execveat compat_sys_execveat +355 common userfaultfd sys_userfaultfd sys_userfaultfd +356 common membarrier sys_membarrier sys_membarrier +357 common recvmmsg sys_recvmmsg compat_sys_recvmmsg +358 common sendmmsg sys_sendmmsg compat_sys_sendmmsg +359 common socket sys_socket sys_socket +360 common socketpair sys_socketpair compat_sys_socketpair +361 common bind sys_bind compat_sys_bind +362 common connect sys_connect compat_sys_connect +363 common listen sys_listen sys_listen +364 common accept4 sys_accept4 compat_sys_accept4 +365 common getsockopt sys_getsockopt compat_sys_getsockopt +366 common setsockopt sys_setsockopt compat_sys_setsockopt +367 common getsockname sys_getsockname compat_sys_getsockname +368 common getpeername sys_getpeername compat_sys_getpeername +369 common sendto sys_sendto compat_sys_sendto +370 common sendmsg sys_sendmsg compat_sys_sendmsg +371 common recvfrom sys_recvfrom compat_sys_recvfrom +372 common recvmsg sys_recvmsg compat_sys_recvmsg +373 common shutdown sys_shutdown sys_shutdown +374 common mlock2 sys_mlock2 compat_sys_mlock2 +375 common copy_file_range sys_copy_file_range compat_sys_copy_file_range +376 common preadv2 sys_preadv2 compat_sys_preadv2 +377 common pwritev2 sys_pwritev2 compat_sys_pwritev2 +378 common s390_guarded_storage sys_s390_guarded_storage compat_sys_s390_guarded_storage +379 common statx sys_statx compat_sys_statx +380 common s390_sthyi sys_s390_sthyi compat_sys_s390_sthyi From 690d22d9d4423b4522fb44a71145403eef2df834 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Thu, 8 Feb 2018 12:47:50 +0100 Subject: [PATCH 121/149] perf s390: Rework system call table creation by using syscall.tbl Recently, s390 uses a syscall.tbl input file to generate its system call table and unistd uapi header files. Hence, update mksyscalltbl to use it as input to create the system table for perf. Signed-off-by: Hendrik Brueckner Cc: Jiri Olsa Cc: Michael Petlan Cc: Thomas Richter Cc: linux-s390@vger.kernel.org LPU-Reference: 1518090470-2899-4-git-send-email-brueckner@linux.vnet.ibm.com Link: https://lkml.kernel.org/n/tip-bdyhllhsq1zgxv2qx4m377y6@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/arch/s390/Makefile | 10 +++++++--- .../perf/arch/s390/entry/syscalls/mksyscalltbl | 18 +++++++----------- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/tools/perf/arch/s390/Makefile b/tools/perf/arch/s390/Makefile index 48228de415d0..dfa6e3103437 100644 --- a/tools/perf/arch/s390/Makefile +++ b/tools/perf/arch/s390/Makefile @@ -10,15 +10,19 @@ PERF_HAVE_ARCH_REGS_QUERY_REGISTER_OFFSET := 1 out := $(OUTPUT)arch/s390/include/generated/asm header := $(out)/syscalls_64.c -sysdef := $(srctree)/tools/arch/s390/include/uapi/asm/unistd.h -sysprf := $(srctree)/tools/perf/arch/s390/entry/syscalls/ +syskrn := $(srctree)/arch/s390/kernel/syscalls/syscall.tbl +sysprf := $(srctree)/tools/perf/arch/s390/entry/syscalls +sysdef := $(sysprf)/syscall.tbl systbl := $(sysprf)/mksyscalltbl # Create output directory if not already present _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') $(header): $(sysdef) $(systbl) - $(Q)$(SHELL) '$(systbl)' '$(CC)' $(sysdef) > $@ + @(test -d ../../kernel -a -d ../../tools -a -d ../perf && ( \ + (diff -B $(sysdef) $(syskrn) >/dev/null) \ + || echo "Warning: Kernel ABI header at '$(sysdef)' differs from latest version at '$(syskrn)'" >&2 )) || true + $(Q)$(SHELL) '$(systbl)' $(sysdef) > $@ clean:: $(call QUIET_CLEAN, s390) $(RM) $(header) diff --git a/tools/perf/arch/s390/entry/syscalls/mksyscalltbl b/tools/perf/arch/s390/entry/syscalls/mksyscalltbl index 7fa0d0abd419..72ecbb676370 100755 --- a/tools/perf/arch/s390/entry/syscalls/mksyscalltbl +++ b/tools/perf/arch/s390/entry/syscalls/mksyscalltbl @@ -3,25 +3,23 @@ # # Generate system call table for perf # -# -# Copyright IBM Corp. 2017 +# Copyright IBM Corp. 2017, 2018 # Author(s): Hendrik Brueckner # -gcc=$1 -input=$2 +SYSCALL_TBL=$1 -if ! test -r $input; then +if ! test -r $SYSCALL_TBL; then echo "Could not read input file" >&2 exit 1 fi create_table() { - local max_nr + local max_nr nr abi sc discard echo 'static const char *syscalltbl_s390_64[] = {' - while read sc nr; do + while read nr abi sc discard; do printf '\t[%d] = "%s",\n' $nr $sc max_nr=$nr done @@ -29,8 +27,6 @@ create_table() echo "#define SYSCALLTBL_S390_64_MAX_ID $max_nr" } - -$gcc -m64 -E -dM -x c $input \ - |sed -ne 's/^#define __NR_//p' \ - |sort -t' ' -k2 -nu \ +grep -E "^[[:digit:]]+[[:space:]]+(common|64)" $SYSCALL_TBL \ + |sort -k1 -n \ |create_table From f1d0b4cde922863004ce3f5f39e8662cc0686c96 Mon Sep 17 00:00:00 2001 From: Hendrik Brueckner Date: Thu, 8 Feb 2018 12:47:48 +0100 Subject: [PATCH 122/149] Revert "tools include s390: Grab a copy of arch/s390/include/uapi/asm/unistd.h" This reverts commit f120c7b187e6c418238710b48723ce141f467543 which is no longer required with the introduction of a syscall.tbl on s390. Signed-off-by: Hendrik Brueckner Cc: Jiri Olsa Cc: Michael Petlan Cc: Thomas Richter Cc: linux-s390@vger.kernel.org LPU-Reference: 1518090470-2899-2-git-send-email-brueckner@linux.vnet.ibm.com Link: https://lkml.kernel.org/n/tip-q1lg0nvhha1tk39ri9aqalcb@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/s390/include/uapi/asm/unistd.h | 412 ---------------------- tools/perf/check-headers.sh | 1 - 2 files changed, 413 deletions(-) delete mode 100644 tools/arch/s390/include/uapi/asm/unistd.h diff --git a/tools/arch/s390/include/uapi/asm/unistd.h b/tools/arch/s390/include/uapi/asm/unistd.h deleted file mode 100644 index 725120939051..000000000000 --- a/tools/arch/s390/include/uapi/asm/unistd.h +++ /dev/null @@ -1,412 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * S390 version - * - * Derived from "include/asm-i386/unistd.h" - */ - -#ifndef _UAPI_ASM_S390_UNISTD_H_ -#define _UAPI_ASM_S390_UNISTD_H_ - -/* - * This file contains the system call numbers. - */ - -#define __NR_exit 1 -#define __NR_fork 2 -#define __NR_read 3 -#define __NR_write 4 -#define __NR_open 5 -#define __NR_close 6 -#define __NR_restart_syscall 7 -#define __NR_creat 8 -#define __NR_link 9 -#define __NR_unlink 10 -#define __NR_execve 11 -#define __NR_chdir 12 -#define __NR_mknod 14 -#define __NR_chmod 15 -#define __NR_lseek 19 -#define __NR_getpid 20 -#define __NR_mount 21 -#define __NR_umount 22 -#define __NR_ptrace 26 -#define __NR_alarm 27 -#define __NR_pause 29 -#define __NR_utime 30 -#define __NR_access 33 -#define __NR_nice 34 -#define __NR_sync 36 -#define __NR_kill 37 -#define __NR_rename 38 -#define __NR_mkdir 39 -#define __NR_rmdir 40 -#define __NR_dup 41 -#define __NR_pipe 42 -#define __NR_times 43 -#define __NR_brk 45 -#define __NR_signal 48 -#define __NR_acct 51 -#define __NR_umount2 52 -#define __NR_ioctl 54 -#define __NR_fcntl 55 -#define __NR_setpgid 57 -#define __NR_umask 60 -#define __NR_chroot 61 -#define __NR_ustat 62 -#define __NR_dup2 63 -#define __NR_getppid 64 -#define __NR_getpgrp 65 -#define __NR_setsid 66 -#define __NR_sigaction 67 -#define __NR_sigsuspend 72 -#define __NR_sigpending 73 -#define __NR_sethostname 74 -#define __NR_setrlimit 75 -#define __NR_getrusage 77 -#define __NR_gettimeofday 78 -#define __NR_settimeofday 79 -#define __NR_symlink 83 -#define __NR_readlink 85 -#define __NR_uselib 86 -#define __NR_swapon 87 -#define __NR_reboot 88 -#define __NR_readdir 89 -#define __NR_mmap 90 -#define __NR_munmap 91 -#define __NR_truncate 92 -#define __NR_ftruncate 93 -#define __NR_fchmod 94 -#define __NR_getpriority 96 -#define __NR_setpriority 97 -#define __NR_statfs 99 -#define __NR_fstatfs 100 -#define __NR_socketcall 102 -#define __NR_syslog 103 -#define __NR_setitimer 104 -#define __NR_getitimer 105 -#define __NR_stat 106 -#define __NR_lstat 107 -#define __NR_fstat 108 -#define __NR_lookup_dcookie 110 -#define __NR_vhangup 111 -#define __NR_idle 112 -#define __NR_wait4 114 -#define __NR_swapoff 115 -#define __NR_sysinfo 116 -#define __NR_ipc 117 -#define __NR_fsync 118 -#define __NR_sigreturn 119 -#define __NR_clone 120 -#define __NR_setdomainname 121 -#define __NR_uname 122 -#define __NR_adjtimex 124 -#define __NR_mprotect 125 -#define __NR_sigprocmask 126 -#define __NR_create_module 127 -#define __NR_init_module 128 -#define __NR_delete_module 129 -#define __NR_get_kernel_syms 130 -#define __NR_quotactl 131 -#define __NR_getpgid 132 -#define __NR_fchdir 133 -#define __NR_bdflush 134 -#define __NR_sysfs 135 -#define __NR_personality 136 -#define __NR_afs_syscall 137 /* Syscall for Andrew File System */ -#define __NR_getdents 141 -#define __NR_flock 143 -#define __NR_msync 144 -#define __NR_readv 145 -#define __NR_writev 146 -#define __NR_getsid 147 -#define __NR_fdatasync 148 -#define __NR__sysctl 149 -#define __NR_mlock 150 -#define __NR_munlock 151 -#define __NR_mlockall 152 -#define __NR_munlockall 153 -#define __NR_sched_setparam 154 -#define __NR_sched_getparam 155 -#define __NR_sched_setscheduler 156 -#define __NR_sched_getscheduler 157 -#define __NR_sched_yield 158 -#define __NR_sched_get_priority_max 159 -#define __NR_sched_get_priority_min 160 -#define __NR_sched_rr_get_interval 161 -#define __NR_nanosleep 162 -#define __NR_mremap 163 -#define __NR_query_module 167 -#define __NR_poll 168 -#define __NR_nfsservctl 169 -#define __NR_prctl 172 -#define __NR_rt_sigreturn 173 -#define __NR_rt_sigaction 174 -#define __NR_rt_sigprocmask 175 -#define __NR_rt_sigpending 176 -#define __NR_rt_sigtimedwait 177 -#define __NR_rt_sigqueueinfo 178 -#define __NR_rt_sigsuspend 179 -#define __NR_pread64 180 -#define __NR_pwrite64 181 -#define __NR_getcwd 183 -#define __NR_capget 184 -#define __NR_capset 185 -#define __NR_sigaltstack 186 -#define __NR_sendfile 187 -#define __NR_getpmsg 188 -#define __NR_putpmsg 189 -#define __NR_vfork 190 -#define __NR_pivot_root 217 -#define __NR_mincore 218 -#define __NR_madvise 219 -#define __NR_getdents64 220 -#define __NR_readahead 222 -#define __NR_setxattr 224 -#define __NR_lsetxattr 225 -#define __NR_fsetxattr 226 -#define __NR_getxattr 227 -#define __NR_lgetxattr 228 -#define __NR_fgetxattr 229 -#define __NR_listxattr 230 -#define __NR_llistxattr 231 -#define __NR_flistxattr 232 -#define __NR_removexattr 233 -#define __NR_lremovexattr 234 -#define __NR_fremovexattr 235 -#define __NR_gettid 236 -#define __NR_tkill 237 -#define __NR_futex 238 -#define __NR_sched_setaffinity 239 -#define __NR_sched_getaffinity 240 -#define __NR_tgkill 241 -/* Number 242 is reserved for tux */ -#define __NR_io_setup 243 -#define __NR_io_destroy 244 -#define __NR_io_getevents 245 -#define __NR_io_submit 246 -#define __NR_io_cancel 247 -#define __NR_exit_group 248 -#define __NR_epoll_create 249 -#define __NR_epoll_ctl 250 -#define __NR_epoll_wait 251 -#define __NR_set_tid_address 252 -#define __NR_fadvise64 253 -#define __NR_timer_create 254 -#define __NR_timer_settime 255 -#define __NR_timer_gettime 256 -#define __NR_timer_getoverrun 257 -#define __NR_timer_delete 258 -#define __NR_clock_settime 259 -#define __NR_clock_gettime 260 -#define __NR_clock_getres 261 -#define __NR_clock_nanosleep 262 -/* Number 263 is reserved for vserver */ -#define __NR_statfs64 265 -#define __NR_fstatfs64 266 -#define __NR_remap_file_pages 267 -#define __NR_mbind 268 -#define __NR_get_mempolicy 269 -#define __NR_set_mempolicy 270 -#define __NR_mq_open 271 -#define __NR_mq_unlink 272 -#define __NR_mq_timedsend 273 -#define __NR_mq_timedreceive 274 -#define __NR_mq_notify 275 -#define __NR_mq_getsetattr 276 -#define __NR_kexec_load 277 -#define __NR_add_key 278 -#define __NR_request_key 279 -#define __NR_keyctl 280 -#define __NR_waitid 281 -#define __NR_ioprio_set 282 -#define __NR_ioprio_get 283 -#define __NR_inotify_init 284 -#define __NR_inotify_add_watch 285 -#define __NR_inotify_rm_watch 286 -#define __NR_migrate_pages 287 -#define __NR_openat 288 -#define __NR_mkdirat 289 -#define __NR_mknodat 290 -#define __NR_fchownat 291 -#define __NR_futimesat 292 -#define __NR_unlinkat 294 -#define __NR_renameat 295 -#define __NR_linkat 296 -#define __NR_symlinkat 297 -#define __NR_readlinkat 298 -#define __NR_fchmodat 299 -#define __NR_faccessat 300 -#define __NR_pselect6 301 -#define __NR_ppoll 302 -#define __NR_unshare 303 -#define __NR_set_robust_list 304 -#define __NR_get_robust_list 305 -#define __NR_splice 306 -#define __NR_sync_file_range 307 -#define __NR_tee 308 -#define __NR_vmsplice 309 -#define __NR_move_pages 310 -#define __NR_getcpu 311 -#define __NR_epoll_pwait 312 -#define __NR_utimes 313 -#define __NR_fallocate 314 -#define __NR_utimensat 315 -#define __NR_signalfd 316 -#define __NR_timerfd 317 -#define __NR_eventfd 318 -#define __NR_timerfd_create 319 -#define __NR_timerfd_settime 320 -#define __NR_timerfd_gettime 321 -#define __NR_signalfd4 322 -#define __NR_eventfd2 323 -#define __NR_inotify_init1 324 -#define __NR_pipe2 325 -#define __NR_dup3 326 -#define __NR_epoll_create1 327 -#define __NR_preadv 328 -#define __NR_pwritev 329 -#define __NR_rt_tgsigqueueinfo 330 -#define __NR_perf_event_open 331 -#define __NR_fanotify_init 332 -#define __NR_fanotify_mark 333 -#define __NR_prlimit64 334 -#define __NR_name_to_handle_at 335 -#define __NR_open_by_handle_at 336 -#define __NR_clock_adjtime 337 -#define __NR_syncfs 338 -#define __NR_setns 339 -#define __NR_process_vm_readv 340 -#define __NR_process_vm_writev 341 -#define __NR_s390_runtime_instr 342 -#define __NR_kcmp 343 -#define __NR_finit_module 344 -#define __NR_sched_setattr 345 -#define __NR_sched_getattr 346 -#define __NR_renameat2 347 -#define __NR_seccomp 348 -#define __NR_getrandom 349 -#define __NR_memfd_create 350 -#define __NR_bpf 351 -#define __NR_s390_pci_mmio_write 352 -#define __NR_s390_pci_mmio_read 353 -#define __NR_execveat 354 -#define __NR_userfaultfd 355 -#define __NR_membarrier 356 -#define __NR_recvmmsg 357 -#define __NR_sendmmsg 358 -#define __NR_socket 359 -#define __NR_socketpair 360 -#define __NR_bind 361 -#define __NR_connect 362 -#define __NR_listen 363 -#define __NR_accept4 364 -#define __NR_getsockopt 365 -#define __NR_setsockopt 366 -#define __NR_getsockname 367 -#define __NR_getpeername 368 -#define __NR_sendto 369 -#define __NR_sendmsg 370 -#define __NR_recvfrom 371 -#define __NR_recvmsg 372 -#define __NR_shutdown 373 -#define __NR_mlock2 374 -#define __NR_copy_file_range 375 -#define __NR_preadv2 376 -#define __NR_pwritev2 377 -#define __NR_s390_guarded_storage 378 -#define __NR_statx 379 -#define __NR_s390_sthyi 380 -#define NR_syscalls 381 - -/* - * There are some system calls that are not present on 64 bit, some - * have a different name although they do the same (e.g. __NR_chown32 - * is __NR_chown on 64 bit). - */ -#ifndef __s390x__ - -#define __NR_time 13 -#define __NR_lchown 16 -#define __NR_setuid 23 -#define __NR_getuid 24 -#define __NR_stime 25 -#define __NR_setgid 46 -#define __NR_getgid 47 -#define __NR_geteuid 49 -#define __NR_getegid 50 -#define __NR_setreuid 70 -#define __NR_setregid 71 -#define __NR_getrlimit 76 -#define __NR_getgroups 80 -#define __NR_setgroups 81 -#define __NR_fchown 95 -#define __NR_ioperm 101 -#define __NR_setfsuid 138 -#define __NR_setfsgid 139 -#define __NR__llseek 140 -#define __NR__newselect 142 -#define __NR_setresuid 164 -#define __NR_getresuid 165 -#define __NR_setresgid 170 -#define __NR_getresgid 171 -#define __NR_chown 182 -#define __NR_ugetrlimit 191 /* SuS compliant getrlimit */ -#define __NR_mmap2 192 -#define __NR_truncate64 193 -#define __NR_ftruncate64 194 -#define __NR_stat64 195 -#define __NR_lstat64 196 -#define __NR_fstat64 197 -#define __NR_lchown32 198 -#define __NR_getuid32 199 -#define __NR_getgid32 200 -#define __NR_geteuid32 201 -#define __NR_getegid32 202 -#define __NR_setreuid32 203 -#define __NR_setregid32 204 -#define __NR_getgroups32 205 -#define __NR_setgroups32 206 -#define __NR_fchown32 207 -#define __NR_setresuid32 208 -#define __NR_getresuid32 209 -#define __NR_setresgid32 210 -#define __NR_getresgid32 211 -#define __NR_chown32 212 -#define __NR_setuid32 213 -#define __NR_setgid32 214 -#define __NR_setfsuid32 215 -#define __NR_setfsgid32 216 -#define __NR_fcntl64 221 -#define __NR_sendfile64 223 -#define __NR_fadvise64_64 264 -#define __NR_fstatat64 293 - -#else - -#define __NR_select 142 -#define __NR_getrlimit 191 /* SuS compliant getrlimit */ -#define __NR_lchown 198 -#define __NR_getuid 199 -#define __NR_getgid 200 -#define __NR_geteuid 201 -#define __NR_getegid 202 -#define __NR_setreuid 203 -#define __NR_setregid 204 -#define __NR_getgroups 205 -#define __NR_setgroups 206 -#define __NR_fchown 207 -#define __NR_setresuid 208 -#define __NR_getresuid 209 -#define __NR_setresgid 210 -#define __NR_getresgid 211 -#define __NR_chown 212 -#define __NR_setuid 213 -#define __NR_setgid 214 -#define __NR_setfsuid 215 -#define __NR_setfsgid 216 -#define __NR_newfstatat 293 - -#endif - -#endif /* _UAPI_ASM_S390_UNISTD_H_ */ diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 51abdb0a4047..790ec25919a0 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -33,7 +33,6 @@ arch/s390/include/uapi/asm/kvm.h arch/s390/include/uapi/asm/kvm_perf.h arch/s390/include/uapi/asm/ptrace.h arch/s390/include/uapi/asm/sie.h -arch/s390/include/uapi/asm/unistd.h arch/arm/include/uapi/asm/kvm.h arch/arm64/include/uapi/asm/kvm.h arch/alpha/include/uapi/asm/errno.h From 096392e0714d3a520366ba467e215edf7280acff Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Thu, 15 Feb 2018 23:53:17 +0900 Subject: [PATCH 123/149] block: fix a typo in comment of BLK_MQ_POLL_STATS_BKTS Update comment typo _consisitent_ to _consistent_ from following commit. commit 0206319fdfee ("blk-mq: Fix poll_stat for new size-based bucketing.") Cc: Jens Axboe Signed-off-by: Minwoo Im Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4f3df807cf8f..ed63f3b69c12 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -49,7 +49,7 @@ struct blk_stat_callback; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ -/* Must be consisitent with blk_mq_poll_stats_bkt() */ +/* Must be consistent with blk_mq_poll_stats_bkt() */ #define BLK_MQ_POLL_STATS_BKTS 16 /* From ec897569ad7dbc6d595873a487c3fac23f463f76 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 31 Jan 2018 22:24:45 +0000 Subject: [PATCH 124/149] usb: Move USB_UHCI_BIG_ENDIAN_* out of USB_SUPPORT Move the Kconfig symbols USB_UHCI_BIG_ENDIAN_MMIO and USB_UHCI_BIG_ENDIAN_DESC out of drivers/usb/host/Kconfig, which is conditional upon USB && USB_SUPPORT, so that it can be freely selected by platform Kconfig symbols in architecture code. For example once the MIPS_GENERIC platform selects are fixed in commit 2e6522c56552 ("MIPS: Fix typo BIG_ENDIAN to CPU_BIG_ENDIAN"), the MIPS 32r6_defconfig warns like so: warning: (MIPS_GENERIC) selects USB_UHCI_BIG_ENDIAN_MMIO which has unmet direct dependencies (USB_SUPPORT && USB) warning: (MIPS_GENERIC) selects USB_UHCI_BIG_ENDIAN_DESC which has unmet direct dependencies (USB_SUPPORT && USB) Fixes: 2e6522c56552 ("MIPS: Fix typo BIG_ENDIAN to CPU_BIG_ENDIAN") Signed-off-by: James Hogan Cc: Corentin Labbe Cc: Ralf Baechle Cc: Paul Burton Cc: linux-usb@vger.kernel.org Cc: linux-mips@linux-mips.org Acked-by: Greg Kroah-Hartman Patchwork: https://patchwork.linux-mips.org/patch/18559/ --- drivers/usb/Kconfig | 8 ++++++++ drivers/usb/host/Kconfig | 8 -------- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/usb/Kconfig b/drivers/usb/Kconfig index f699abab1787..65812a2f60b4 100644 --- a/drivers/usb/Kconfig +++ b/drivers/usb/Kconfig @@ -19,6 +19,14 @@ config USB_EHCI_BIG_ENDIAN_MMIO config USB_EHCI_BIG_ENDIAN_DESC bool +config USB_UHCI_BIG_ENDIAN_MMIO + bool + default y if SPARC_LEON + +config USB_UHCI_BIG_ENDIAN_DESC + bool + default y if SPARC_LEON + menuconfig USB_SUPPORT bool "USB support" depends on HAS_IOMEM diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig index 6150bed7cfa8..4fcfb3084b36 100644 --- a/drivers/usb/host/Kconfig +++ b/drivers/usb/host/Kconfig @@ -633,14 +633,6 @@ config USB_UHCI_ASPEED bool default y if ARCH_ASPEED -config USB_UHCI_BIG_ENDIAN_MMIO - bool - default y if SPARC_LEON - -config USB_UHCI_BIG_ENDIAN_DESC - bool - default y if SPARC_LEON - config USB_FHCI_HCD tristate "Freescale QE USB Host Controller support" depends on OF_GPIO && QE_GPIO && QUICC_ENGINE From 5efad9eee33ee5fc4bf3059f74f3932a638534d1 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 31 Jan 2018 22:24:46 +0000 Subject: [PATCH 125/149] sparc,leon: Select USB_UHCI_BIG_ENDIAN_{MMIO,DESC} Now that USB_UHCI_BIG_ENDIAN_MMIO and USB_UHCI_BIG_ENDIAN_DESC are moved outside of the USB_SUPPORT conditional, simply select them from SPARC_LEON rather than by the symbol's defaults in drivers/usb/Kconfig, similar to how it is done for USB_EHCI_BIG_ENDIAN_MMIO and USB_EHCI_BIG_ENDIAN_DESC. Signed-off-by: James Hogan Cc: "David S. Miller" Cc: Greg Kroah-Hartman Cc: Corentin Labbe Cc: sparclinux@vger.kernel.org Cc: linux-usb@vger.kernel.org Acked-by: David S. Miller Patchwork: https://patchwork.linux-mips.org/patch/18560/ --- arch/sparc/Kconfig | 2 ++ drivers/usb/Kconfig | 2 -- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 6bf594ace663..8767e45f1b2b 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -430,6 +430,8 @@ config SPARC_LEON depends on SPARC32 select USB_EHCI_BIG_ENDIAN_MMIO select USB_EHCI_BIG_ENDIAN_DESC + select USB_UHCI_BIG_ENDIAN_MMIO + select USB_UHCI_BIG_ENDIAN_DESC ---help--- If you say Y here if you are running on a SPARC-LEON processor. The LEON processor is a synthesizable VHDL model of the diff --git a/drivers/usb/Kconfig b/drivers/usb/Kconfig index 65812a2f60b4..148f3ee70286 100644 --- a/drivers/usb/Kconfig +++ b/drivers/usb/Kconfig @@ -21,11 +21,9 @@ config USB_EHCI_BIG_ENDIAN_DESC config USB_UHCI_BIG_ENDIAN_MMIO bool - default y if SPARC_LEON config USB_UHCI_BIG_ENDIAN_DESC bool - default y if SPARC_LEON menuconfig USB_SUPPORT bool "USB support" From 92256269893e96e5f9e8ac6dd882a0bef63fcea7 Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 7 Feb 2018 18:40:27 +0100 Subject: [PATCH 126/149] drm/nouveau: Make clock gate support conditional The recently introduced clock gate support breaks on Tegra chips because no thermal support is enabled for those devices. Conditionalize the code on the existence of thermal support to fix this. Fixes: b138eca661cc ("drm/nouveau: Add support for basic clockgating on Kepler1") Cc: Martin Peres Cc: Lyude Paul Signed-off-by: Thierry Reding Reviewed-by: Lyude Paul Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c index bf62303571b3..3695cde669f8 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/therm/base.c @@ -301,7 +301,7 @@ nvkm_therm_attr_set(struct nvkm_therm *therm, void nvkm_therm_clkgate_enable(struct nvkm_therm *therm) { - if (!therm->func->clkgate_enable || !therm->clkgating_enabled) + if (!therm || !therm->func->clkgate_enable || !therm->clkgating_enabled) return; nvkm_debug(&therm->subdev, @@ -312,7 +312,7 @@ nvkm_therm_clkgate_enable(struct nvkm_therm *therm) void nvkm_therm_clkgate_fini(struct nvkm_therm *therm, bool suspend) { - if (!therm->func->clkgate_fini || !therm->clkgating_enabled) + if (!therm || !therm->func->clkgate_fini || !therm->clkgating_enabled) return; nvkm_debug(&therm->subdev, @@ -395,7 +395,7 @@ void nvkm_therm_clkgate_init(struct nvkm_therm *therm, const struct nvkm_therm_clkgate_pack *p) { - if (!therm->func->clkgate_init || !therm->clkgating_enabled) + if (!therm || !therm->func->clkgate_init || !therm->clkgating_enabled) return; therm->func->clkgate_init(therm, p); From 12310e3437554328bcd75186cf331bc712cb30b2 Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Wed, 10 Jan 2018 00:51:23 +0100 Subject: [PATCH 127/149] kprobes: Propagate error from arm_kprobe_ftrace() Improve error handling when arming ftrace-based kprobes. Specifically, if we fail to arm a ftrace-based kprobe, register_kprobe()/enable_kprobe() should report an error instead of success. Previously, this has lead to confusing situations where register_kprobe() would return 0 indicating success, but the kprobe would not be functional if ftrace registration during the kprobe arming process had failed. We should therefore take any errors returned by ftrace into account and propagate this error so that we do not register/enable kprobes that cannot be armed. This can happen if, for example, register_ftrace_function() finds an IPMODIFY conflict (since kprobe_ftrace_ops has this flag set) and returns an error. Such a conflict is possible since livepatches also set the IPMODIFY flag for their ftrace_ops. arm_all_kprobes() keeps its current behavior and attempts to arm all kprobes. It returns the last encountered error and gives a warning if not all probes could be armed. This patch is based on Petr Mladek's original patchset (patches 2 and 3) back in 2015, which improved kprobes error handling, found here: https://lkml.org/lkml/2015/2/26/452 However, further work on this had been paused since then and the patches were not upstreamed. Based-on-patches-by: Petr Mladek Signed-off-by: Jessica Yu Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S . Miller Cc: Jiri Kosina Cc: Joe Lawrence Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Miroslav Benes Cc: Peter Zijlstra Cc: Petr Mladek Cc: Steven Rostedt Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/20180109235124.30886-2-jeyu@kernel.org Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 102 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 76 insertions(+), 26 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index da2ccf142358..2d988141ab85 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -978,18 +978,36 @@ static int prepare_kprobe(struct kprobe *p) } /* Caller must lock kprobe_mutex */ -static void arm_kprobe_ftrace(struct kprobe *p) +static int arm_kprobe_ftrace(struct kprobe *p) { - int ret; + int ret = 0; ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 0, 0); - WARN(ret < 0, "Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); - kprobe_ftrace_enabled++; - if (kprobe_ftrace_enabled == 1) { - ret = register_ftrace_function(&kprobe_ftrace_ops); - WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); + if (ret) { + pr_debug("Failed to arm kprobe-ftrace at %p (%d)\n", p->addr, ret); + return ret; } + + if (kprobe_ftrace_enabled == 0) { + ret = register_ftrace_function(&kprobe_ftrace_ops); + if (ret) { + pr_debug("Failed to init kprobe-ftrace (%d)\n", ret); + goto err_ftrace; + } + } + + kprobe_ftrace_enabled++; + return ret; + +err_ftrace: + /* + * Note: Since kprobe_ftrace_ops has IPMODIFY set, and ftrace requires a + * non-empty filter_hash for IPMODIFY ops, we're safe from an accidental + * empty filter_hash which would undesirably trace all functions. + */ + ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); + return ret; } /* Caller must lock kprobe_mutex */ @@ -1008,22 +1026,23 @@ static void disarm_kprobe_ftrace(struct kprobe *p) } #else /* !CONFIG_KPROBES_ON_FTRACE */ #define prepare_kprobe(p) arch_prepare_kprobe(p) -#define arm_kprobe_ftrace(p) do {} while (0) +#define arm_kprobe_ftrace(p) (-ENODEV) #define disarm_kprobe_ftrace(p) do {} while (0) #endif /* Arm a kprobe with text_mutex */ -static void arm_kprobe(struct kprobe *kp) +static int arm_kprobe(struct kprobe *kp) { - if (unlikely(kprobe_ftrace(kp))) { - arm_kprobe_ftrace(kp); - return; - } + if (unlikely(kprobe_ftrace(kp))) + return arm_kprobe_ftrace(kp); + cpus_read_lock(); mutex_lock(&text_mutex); __arm_kprobe(kp); mutex_unlock(&text_mutex); cpus_read_unlock(); + + return 0; } /* Disarm a kprobe with text_mutex */ @@ -1362,9 +1381,15 @@ out: if (ret == 0 && kprobe_disabled(ap) && !kprobe_disabled(p)) { ap->flags &= ~KPROBE_FLAG_DISABLED; - if (!kprobes_all_disarmed) + if (!kprobes_all_disarmed) { /* Arm the breakpoint again. */ - arm_kprobe(ap); + ret = arm_kprobe(ap); + if (ret) { + ap->flags |= KPROBE_FLAG_DISABLED; + list_del_rcu(&p->list); + synchronize_sched(); + } + } } return ret; } @@ -1573,8 +1598,14 @@ int register_kprobe(struct kprobe *p) hlist_add_head_rcu(&p->hlist, &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); - if (!kprobes_all_disarmed && !kprobe_disabled(p)) - arm_kprobe(p); + if (!kprobes_all_disarmed && !kprobe_disabled(p)) { + ret = arm_kprobe(p); + if (ret) { + hlist_del_rcu(&p->hlist); + synchronize_sched(); + goto out; + } + } /* Try to optimize kprobe */ try_to_optimize_kprobe(p); @@ -2116,7 +2147,9 @@ int enable_kprobe(struct kprobe *kp) if (!kprobes_all_disarmed && kprobe_disabled(p)) { p->flags &= ~KPROBE_FLAG_DISABLED; - arm_kprobe(p); + ret = arm_kprobe(p); + if (ret) + p->flags |= KPROBE_FLAG_DISABLED; } out: mutex_unlock(&kprobe_mutex); @@ -2407,11 +2440,12 @@ static const struct file_operations debugfs_kprobe_blacklist_ops = { .release = seq_release, }; -static void arm_all_kprobes(void) +static int arm_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; - unsigned int i; + unsigned int i, total = 0, errors = 0; + int err, ret = 0; mutex_lock(&kprobe_mutex); @@ -2428,16 +2462,28 @@ static void arm_all_kprobes(void) /* Arming kprobes doesn't optimize kprobe itself */ for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; - hlist_for_each_entry_rcu(p, head, hlist) - if (!kprobe_disabled(p)) - arm_kprobe(p); + /* Arm all kprobes on a best-effort basis */ + hlist_for_each_entry_rcu(p, head, hlist) { + if (!kprobe_disabled(p)) { + err = arm_kprobe(p); + if (err) { + errors++; + ret = err; + } + total++; + } + } } - printk(KERN_INFO "Kprobes globally enabled\n"); + if (errors) + pr_warn("Kprobes globally enabled, but failed to arm %d out of %d probes\n", + errors, total); + else + pr_info("Kprobes globally enabled\n"); already_enabled: mutex_unlock(&kprobe_mutex); - return; + return ret; } static void disarm_all_kprobes(void) @@ -2494,6 +2540,7 @@ static ssize_t write_enabled_file_bool(struct file *file, { char buf[32]; size_t buf_size; + int ret = 0; buf_size = min(count, (sizeof(buf)-1)); if (copy_from_user(buf, user_buf, buf_size)) @@ -2504,7 +2551,7 @@ static ssize_t write_enabled_file_bool(struct file *file, case 'y': case 'Y': case '1': - arm_all_kprobes(); + ret = arm_all_kprobes(); break; case 'n': case 'N': @@ -2515,6 +2562,9 @@ static ssize_t write_enabled_file_bool(struct file *file, return -EINVAL; } + if (ret) + return ret; + return count; } From 297f9233b53a08fd457815e19f1d6f2c3389857b Mon Sep 17 00:00:00 2001 From: Jessica Yu Date: Wed, 10 Jan 2018 00:51:24 +0100 Subject: [PATCH 128/149] kprobes: Propagate error from disarm_kprobe_ftrace() Improve error handling when disarming ftrace-based kprobes. Like with arm_kprobe_ftrace(), propagate any errors from disarm_kprobe_ftrace() so that we do not disable/unregister kprobes that are still armed. In other words, unregister_kprobe() and disable_kprobe() should not report success if the kprobe could not be disarmed. disarm_all_kprobes() keeps its current behavior and attempts to disarm all kprobes. It returns the last encountered error and gives a warning if not all probes could be disarmed. This patch is based on Petr Mladek's original patchset (patches 2 and 3) back in 2015, which improved kprobes error handling, found here: https://lkml.org/lkml/2015/2/26/452 However, further work on this had been paused since then and the patches were not upstreamed. Based-on-patches-by: Petr Mladek Signed-off-by: Jessica Yu Acked-by: Masami Hiramatsu Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: David S . Miller Cc: Jiri Kosina Cc: Joe Lawrence Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Miroslav Benes Cc: Peter Zijlstra Cc: Petr Mladek Cc: Steven Rostedt Cc: Thomas Gleixner Cc: live-patching@vger.kernel.org Link: http://lkml.kernel.org/r/20180109235124.30886-3-jeyu@kernel.org Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 80 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 54 insertions(+), 26 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 2d988141ab85..102160ff5c66 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1011,23 +1011,27 @@ err_ftrace: } /* Caller must lock kprobe_mutex */ -static void disarm_kprobe_ftrace(struct kprobe *p) +static int disarm_kprobe_ftrace(struct kprobe *p) { - int ret; + int ret = 0; + + if (kprobe_ftrace_enabled == 1) { + ret = unregister_ftrace_function(&kprobe_ftrace_ops); + if (WARN(ret < 0, "Failed to unregister kprobe-ftrace (%d)\n", ret)) + return ret; + } kprobe_ftrace_enabled--; - if (kprobe_ftrace_enabled == 0) { - ret = unregister_ftrace_function(&kprobe_ftrace_ops); - WARN(ret < 0, "Failed to init kprobe-ftrace (%d)\n", ret); - } + ret = ftrace_set_filter_ip(&kprobe_ftrace_ops, (unsigned long)p->addr, 1, 0); WARN(ret < 0, "Failed to disarm kprobe-ftrace at %p (%d)\n", p->addr, ret); + return ret; } #else /* !CONFIG_KPROBES_ON_FTRACE */ #define prepare_kprobe(p) arch_prepare_kprobe(p) #define arm_kprobe_ftrace(p) (-ENODEV) -#define disarm_kprobe_ftrace(p) do {} while (0) +#define disarm_kprobe_ftrace(p) (-ENODEV) #endif /* Arm a kprobe with text_mutex */ @@ -1046,18 +1050,18 @@ static int arm_kprobe(struct kprobe *kp) } /* Disarm a kprobe with text_mutex */ -static void disarm_kprobe(struct kprobe *kp, bool reopt) +static int disarm_kprobe(struct kprobe *kp, bool reopt) { - if (unlikely(kprobe_ftrace(kp))) { - disarm_kprobe_ftrace(kp); - return; - } + if (unlikely(kprobe_ftrace(kp))) + return disarm_kprobe_ftrace(kp); cpus_read_lock(); mutex_lock(&text_mutex); __disarm_kprobe(kp, reopt); mutex_unlock(&text_mutex); cpus_read_unlock(); + + return 0; } /* @@ -1639,11 +1643,12 @@ static int aggr_kprobe_disabled(struct kprobe *ap) static struct kprobe *__disable_kprobe(struct kprobe *p) { struct kprobe *orig_p; + int ret; /* Get an original kprobe for return */ orig_p = __get_valid_kprobe(p); if (unlikely(orig_p == NULL)) - return NULL; + return ERR_PTR(-EINVAL); if (!kprobe_disabled(p)) { /* Disable probe if it is a child probe */ @@ -1657,8 +1662,13 @@ static struct kprobe *__disable_kprobe(struct kprobe *p) * should have already been disarmed, so * skip unneed disarming process. */ - if (!kprobes_all_disarmed) - disarm_kprobe(orig_p, true); + if (!kprobes_all_disarmed) { + ret = disarm_kprobe(orig_p, true); + if (ret) { + p->flags &= ~KPROBE_FLAG_DISABLED; + return ERR_PTR(ret); + } + } orig_p->flags |= KPROBE_FLAG_DISABLED; } } @@ -1675,8 +1685,8 @@ static int __unregister_kprobe_top(struct kprobe *p) /* Disable kprobe. This will disarm it if needed. */ ap = __disable_kprobe(p); - if (ap == NULL) - return -EINVAL; + if (IS_ERR(ap)) + return PTR_ERR(ap); if (ap == p) /* @@ -2109,12 +2119,14 @@ static void kill_kprobe(struct kprobe *p) int disable_kprobe(struct kprobe *kp) { int ret = 0; + struct kprobe *p; mutex_lock(&kprobe_mutex); /* Disable this kprobe */ - if (__disable_kprobe(kp) == NULL) - ret = -EINVAL; + p = __disable_kprobe(kp); + if (IS_ERR(p)) + ret = PTR_ERR(p); mutex_unlock(&kprobe_mutex); return ret; @@ -2486,34 +2498,50 @@ already_enabled: return ret; } -static void disarm_all_kprobes(void) +static int disarm_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; - unsigned int i; + unsigned int i, total = 0, errors = 0; + int err, ret = 0; mutex_lock(&kprobe_mutex); /* If kprobes are already disarmed, just return */ if (kprobes_all_disarmed) { mutex_unlock(&kprobe_mutex); - return; + return 0; } kprobes_all_disarmed = true; - printk(KERN_INFO "Kprobes globally disabled\n"); for (i = 0; i < KPROBE_TABLE_SIZE; i++) { head = &kprobe_table[i]; + /* Disarm all kprobes on a best-effort basis */ hlist_for_each_entry_rcu(p, head, hlist) { - if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) - disarm_kprobe(p, false); + if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p)) { + err = disarm_kprobe(p, false); + if (err) { + errors++; + ret = err; + } + total++; + } } } + + if (errors) + pr_warn("Kprobes globally disabled, but failed to disarm %d out of %d probes\n", + errors, total); + else + pr_info("Kprobes globally disabled\n"); + mutex_unlock(&kprobe_mutex); /* Wait for disarming all kprobes by optimizer */ wait_for_kprobe_optimizer(); + + return ret; } /* @@ -2556,7 +2584,7 @@ static ssize_t write_enabled_file_bool(struct file *file, case 'n': case 'N': case '0': - disarm_all_kprobes(); + ret = disarm_all_kprobes(); break; default: return -EINVAL; From f960cfd12650fad43c1cde07a1f7642cf2c57f97 Mon Sep 17 00:00:00 2001 From: Matthew Whitehead Date: Thu, 15 Feb 2018 11:54:54 -0500 Subject: [PATCH 129/149] x86/Kconfig: Add missing i586-class CPUs to the X86_CMPXCHG64 Kconfig group Several i586-class CPUs supporting this instruction are missing from the X86_CMPXCHG64 config group. Using a configuration with either M586TSC or M586MMX currently sets X86_MINIMUM_CPU_FAMILY=4 instead of the correct value of 5. Booting on an i486 it will fail to generate the "This kernel requires an i586 CPU, but only detected an i486 CPU" message and intentional halt as expected. It will instead just silently hang when it hits i586-specific instructions. The M586 CPU is not in this list because at least the Cyrix 5x86 lacks this instruction, and perhaps others. Signed-off-by: Matthew Whitehead Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1518713696-11360-1-git-send-email-tedheadster@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 65a9a4716e34..ec64aa728727 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -374,7 +374,7 @@ config X86_TSC config X86_CMPXCHG64 def_bool y - depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM + depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 # this should be set for all -march=.. options where the compiler # generates cmov. From 69b8d3fcabdc81d9efd82b4a506c8279cbaba692 Mon Sep 17 00:00:00 2001 From: Matthew Whitehead Date: Thu, 15 Feb 2018 11:54:55 -0500 Subject: [PATCH 130/149] x86/Kconfig: Exclude i586-class CPUs lacking PAE support from the HIGHMEM64G Kconfig group i586-class machines also lack support for Physical Address Extension (PAE), so add them to the exclusion list. Signed-off-by: Matthew Whitehead Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1518713696-11360-2-git-send-email-tedheadster@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a528c14d45a5..c1236b187824 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1404,7 +1404,7 @@ config HIGHMEM4G config HIGHMEM64G bool "64GB" - depends on !M486 + depends on !M486 && !M586 && !M586TSC && !M586MMX && !MGEODE_LX && !MGEODEGX1 && !MCYRIXIII && !MELAN && !MWINCHIPC6 && !WINCHIP3D && !MK6 select X86_PAE ---help--- Select this if you have a 32-bit processor and more than 4 From 25d76ac888216c369dea91768764728b83769799 Mon Sep 17 00:00:00 2001 From: Matthew Whitehead Date: Thu, 15 Feb 2018 11:54:56 -0500 Subject: [PATCH 131/149] x86/Kconfig: Explicitly enumerate i686-class CPUs in Kconfig The X86_P6_NOP config class leaves out many i686-class CPUs. Instead, explicitly enumerate all these CPUs. Using a configuration with M686 currently sets X86_MINIMUM_CPU_FAMILY=5 instead of the correct value of 6. Booting on an i586 it will fail to generate the "This kernel requires an i686 CPU, but only detected an i586 CPU" message and intentional halt as expected. It will instead just silently hang when it hits i686-specific instructions. Signed-off-by: Matthew Whitehead Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1518713696-11360-3-git-send-email-tedheadster@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index ec64aa728727..8b8d2297d486 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -385,7 +385,7 @@ config X86_CMOV config X86_MINIMUM_CPU_FAMILY int default "64" if X86_64 - default "6" if X86_32 && X86_P6_NOP + default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCRUSOE || MCORE2 || MK7 || MK8) default "5" if X86_32 && X86_CMPXCHG64 default "4" From d207af2eab3f8668b95ad02b21930481c42806fd Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Wed, 14 Feb 2018 02:54:03 +0000 Subject: [PATCH 132/149] cpumask: Make for_each_cpu_wrap() available on UP as well for_each_cpu_wrap() was originally added in the #else half of a large "#if NR_CPUS == 1" statement, but was omitted in the #if half. This patch adds the missing #if half to prevent compile errors when NR_CPUS is 1. Reported-by: kbuild test robot Signed-off-by: Michael Kelley Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kys@microsoft.com Cc: martin.petersen@oracle.com Cc: mikelley@microsoft.com Fixes: c743f0a5c50f ("sched/fair, cpumask: Export for_each_cpu_wrap()") Link: http://lkml.kernel.org/r/SN6PR1901MB2045F087F59450507D4FCC17CBF50@SN6PR1901MB2045.namprd19.prod.outlook.com Signed-off-by: Ingo Molnar --- include/linux/cpumask.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index d4a2a7dcd72d..bf53d893ad02 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -170,6 +170,8 @@ static inline unsigned int cpumask_local_spread(unsigned int i, int node) for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_not(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) +#define for_each_cpu_wrap(cpu, mask, start) \ + for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start)) #define for_each_cpu_and(cpu, mask, and) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)and) #else From 2c10636a0b9c689450e85f9945583920f50337c9 Mon Sep 17 00:00:00 2001 From: Nathan Fontenot Date: Thu, 15 Feb 2018 21:27:41 -0600 Subject: [PATCH 133/149] powerpc/pseries: Check for zero filled ibm,dynamic-memory property Some versions of QEMU will produce an ibm,dynamic-reconfiguration-memory node with a ibm,dynamic-memory property that is zero-filled. This causes the drmem code to oops trying to parse this property. The fix for this is to validate that the property does contain LMB entries before trying to parse it and bail if the count is zero. Oops: Kernel access of bad area, sig: 11 [#1] DAR: 0000000000000010 NIP read_drconf_v1_cell+0x54/0x9c LR read_drconf_v1_cell+0x48/0x9c Call Trace: __param_initcall_debug+0x0/0x28 (unreliable) drmem_init+0x144/0x2f8 do_one_initcall+0x64/0x1d0 kernel_init_freeable+0x298/0x38c kernel_init+0x24/0x160 ret_from_kernel_thread+0x5c/0xb4 The ibm,dynamic-reconfiguration-memory device tree property generated that causes this: ibm,dynamic-reconfiguration-memory { ibm,lmb-size = <0x0 0x10000000>; ibm,memory-flags-mask = <0xff>; ibm,dynamic-memory = <0x0 0x0 0x0 0x0 0x0 0x0>; linux,phandle = <0x7e57eed8>; ibm,associativity-lookup-arrays = <0x1 0x4 0x0 0x0 0x0 0x0>; ibm,memory-preservation-time = <0x0>; }; Signed-off-by: Nathan Fontenot Reviewed-by: Cyril Bur Tested-by: Daniel Black [mpe: Trim oops report] Signed-off-by: Michael Ellerman --- arch/powerpc/mm/drmem.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c index 1604110c4238..916844f99c64 100644 --- a/arch/powerpc/mm/drmem.c +++ b/arch/powerpc/mm/drmem.c @@ -216,6 +216,8 @@ static void __init __walk_drmem_v1_lmbs(const __be32 *prop, const __be32 *usm, u32 i, n_lmbs; n_lmbs = of_read_number(prop++, 1); + if (n_lmbs == 0) + return; for (i = 0; i < n_lmbs; i++) { read_drconf_v1_cell(&lmb, &prop); @@ -245,6 +247,8 @@ static void __init __walk_drmem_v2_lmbs(const __be32 *prop, const __be32 *usm, u32 i, j, lmb_sets; lmb_sets = of_read_number(prop++, 1); + if (lmb_sets == 0) + return; for (i = 0; i < lmb_sets; i++) { read_drconf_v2_cell(&dr_cell, &prop); @@ -354,6 +358,8 @@ static void __init init_drmem_v1_lmbs(const __be32 *prop) struct drmem_lmb *lmb; drmem_info->n_lmbs = of_read_number(prop++, 1); + if (drmem_info->n_lmbs == 0) + return; drmem_info->lmbs = kcalloc(drmem_info->n_lmbs, sizeof(*lmb), GFP_KERNEL); @@ -373,6 +379,8 @@ static void __init init_drmem_v2_lmbs(const __be32 *prop) int lmb_index; lmb_sets = of_read_number(prop++, 1); + if (lmb_sets == 0) + return; /* first pass, calculate the number of LMBs */ p = prop; From 285cb4f62319737e6538252cf1a67ce9da5cf3d5 Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Mon, 5 Feb 2018 16:45:36 +0000 Subject: [PATCH 134/149] irqchip/mips-gic: Avoid spuriously handling masked interrupts Commit 7778c4b27cbe ("irqchip: mips-gic: Use pcpu_masks to avoid reading GIC_SH_MASK*") removed the read of the hardware mask register when handling shared interrupts, instead using the driver's shadow pcpu_masks entry as the effective mask. Unfortunately this did not take account of the write to pcpu_masks during gic_shared_irq_domain_map, which effectively unmasks the interrupt early. If an interrupt is asserted, gic_handle_shared_int decodes and processes the interrupt even though it has not yet been unmasked via gic_unmask_irq, which also sets the appropriate bit in pcpu_masks. On the MIPS Boston board, when a console command line of "console=ttyS0,115200n8r" is passed, the modem status IRQ is enabled in the UART, which is immediately raised to the GIC. The interrupt has been mapped, but no handler has yet been registered, nor is it expected to be unmasked. However, the write to pcpu_masks in gic_shared_irq_domain_map has effectively unmasked it, resulting in endless reports of: [ 5.058454] irq 13, desc: ffffffff80a7ad80, depth: 1, count: 0, unhandled: 0 [ 5.062057] ->handle_irq(): ffffffff801b1838, [ 5.062175] handle_bad_irq+0x0/0x2c0 Where IRQ 13 is the UART interrupt. To fix this, just remove the write to pcpu_masks in gic_shared_irq_domain_map. The existing write in gic_unmask_irq is the correct place for what is now the effective unmasking. Cc: stable@vger.kernel.org Fixes: 7778c4b27cbe ("irqchip: mips-gic: Use pcpu_masks to avoid reading GIC_SH_MASK*") Signed-off-by: Matt Redfearn Reviewed-by: Paul Burton Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-mips-gic.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index ef92a4d2038e..d32268cc1174 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -424,8 +424,6 @@ static int gic_shared_irq_domain_map(struct irq_domain *d, unsigned int virq, spin_lock_irqsave(&gic_lock, flags); write_gic_map_pin(intr, GIC_MAP_PIN_MAP_TO_PIN | gic_cpu_pin); write_gic_map_vp(intr, BIT(mips_cm_vp_id(cpu))); - gic_clear_pcpu_masks(intr); - set_bit(intr, per_cpu_ptr(pcpu_masks, cpu)); irq_data_update_effective_affinity(data, cpumask_of(cpu)); spin_unlock_irqrestore(&gic_lock, flags); From b6dd4d83dc2f78cebc9a7e6e7e4bc2be4d29b94d Mon Sep 17 00:00:00 2001 From: Mark Salter Date: Fri, 2 Feb 2018 09:20:29 -0500 Subject: [PATCH 135/149] irqchip/gic-v3: Change pr_debug message to pr_devel The pr_debug() in gic-v3 gic_send_sgi() can trigger a circular locking warning: GICv3: CPU10: ICC_SGI1R_EL1 5000400 ====================================================== WARNING: possible circular locking dependency detected 4.15.0+ #1 Tainted: G W ------------------------------------------------------ dynamic_debug01/1873 is trying to acquire lock: ((console_sem).lock){-...}, at: [<0000000099c891ec>] down_trylock+0x20/0x4c but task is already holding lock: (&rq->lock){-.-.}, at: [<00000000842e1587>] __task_rq_lock+0x54/0xdc which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #2 (&rq->lock){-.-.}: __lock_acquire+0x3b4/0x6e0 lock_acquire+0xf4/0x2a8 _raw_spin_lock+0x4c/0x60 task_fork_fair+0x3c/0x148 sched_fork+0x10c/0x214 copy_process.isra.32.part.33+0x4e8/0x14f0 _do_fork+0xe8/0x78c kernel_thread+0x48/0x54 rest_init+0x34/0x2a4 start_kernel+0x45c/0x488 -> #1 (&p->pi_lock){-.-.}: __lock_acquire+0x3b4/0x6e0 lock_acquire+0xf4/0x2a8 _raw_spin_lock_irqsave+0x58/0x70 try_to_wake_up+0x48/0x600 wake_up_process+0x28/0x34 __up.isra.0+0x60/0x6c up+0x60/0x68 __up_console_sem+0x4c/0x7c console_unlock+0x328/0x634 vprintk_emit+0x25c/0x390 dev_vprintk_emit+0xc4/0x1fc dev_printk_emit+0x88/0xa8 __dev_printk+0x58/0x9c _dev_info+0x84/0xa8 usb_new_device+0x100/0x474 hub_port_connect+0x280/0x92c hub_event+0x740/0xa84 process_one_work+0x240/0x70c worker_thread+0x60/0x400 kthread+0x110/0x13c ret_from_fork+0x10/0x18 -> #0 ((console_sem).lock){-...}: validate_chain.isra.34+0x6e4/0xa20 __lock_acquire+0x3b4/0x6e0 lock_acquire+0xf4/0x2a8 _raw_spin_lock_irqsave+0x58/0x70 down_trylock+0x20/0x4c __down_trylock_console_sem+0x3c/0x9c console_trylock+0x20/0xb0 vprintk_emit+0x254/0x390 vprintk_default+0x58/0x90 vprintk_func+0xbc/0x164 printk+0x80/0xa0 __dynamic_pr_debug+0x84/0xac gic_raise_softirq+0x184/0x18c smp_cross_call+0xac/0x218 smp_send_reschedule+0x3c/0x48 resched_curr+0x60/0x9c check_preempt_curr+0x70/0xdc wake_up_new_task+0x310/0x470 _do_fork+0x188/0x78c SyS_clone+0x44/0x50 __sys_trace_return+0x0/0x4 other info that might help us debug this: Chain exists of: (console_sem).lock --> &p->pi_lock --> &rq->lock Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&rq->lock); lock(&p->pi_lock); lock(&rq->lock); lock((console_sem).lock); *** DEADLOCK *** 2 locks held by dynamic_debug01/1873: #0: (&p->pi_lock){-.-.}, at: [<000000001366df53>] wake_up_new_task+0x40/0x470 #1: (&rq->lock){-.-.}, at: [<00000000842e1587>] __task_rq_lock+0x54/0xdc stack backtrace: CPU: 10 PID: 1873 Comm: dynamic_debug01 Tainted: G W 4.15.0+ #1 Hardware name: GIGABYTE R120-T34-00/MT30-GS2-00, BIOS T48 10/02/2017 Call trace: dump_backtrace+0x0/0x188 show_stack+0x24/0x2c dump_stack+0xa4/0xe0 print_circular_bug.isra.31+0x29c/0x2b8 check_prev_add.constprop.39+0x6c8/0x6dc validate_chain.isra.34+0x6e4/0xa20 __lock_acquire+0x3b4/0x6e0 lock_acquire+0xf4/0x2a8 _raw_spin_lock_irqsave+0x58/0x70 down_trylock+0x20/0x4c __down_trylock_console_sem+0x3c/0x9c console_trylock+0x20/0xb0 vprintk_emit+0x254/0x390 vprintk_default+0x58/0x90 vprintk_func+0xbc/0x164 printk+0x80/0xa0 __dynamic_pr_debug+0x84/0xac gic_raise_softirq+0x184/0x18c smp_cross_call+0xac/0x218 smp_send_reschedule+0x3c/0x48 resched_curr+0x60/0x9c check_preempt_curr+0x70/0xdc wake_up_new_task+0x310/0x470 _do_fork+0x188/0x78c SyS_clone+0x44/0x50 __sys_trace_return+0x0/0x4 GICv3: CPU0: ICC_SGI1R_EL1 12000 This could be fixed with printk_deferred() but that might lessen its usefulness for debugging. So change it to pr_devel to keep it out of production kernels. Developers working on gic-v3 can enable it as needed in their kernels. Signed-off-by: Mark Salter Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index a57c0fbbd34a..d71be9a1f9d2 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -673,7 +673,7 @@ static void gic_send_sgi(u64 cluster_id, u16 tlist, unsigned int irq) MPIDR_TO_SGI_RS(cluster_id) | tlist << ICC_SGI1R_TARGET_LIST_SHIFT); - pr_debug("CPU%d: ICC_SGI1R_EL1 %llx\n", smp_processor_id(), val); + pr_devel("CPU%d: ICC_SGI1R_EL1 %llx\n", smp_processor_id(), val); gic_write_sgi1r(val); } From 21ec30c0ef5234fb1039cc7c7737d885bf875a9e Mon Sep 17 00:00:00 2001 From: Shanker Donthineni Date: Wed, 31 Jan 2018 18:03:42 -0600 Subject: [PATCH 136/149] irqchip/gic-v3: Use wmb() instead of smb_wmb() in gic_raise_softirq() A DMB instruction can be used to ensure the relative order of only memory accesses before and after the barrier. Since writes to system registers are not memory operations, barrier DMB is not sufficient for observability of memory accesses that occur before ICC_SGI1R_EL1 writes. A DSB instruction ensures that no instructions that appear in program order after the DSB instruction, can execute until the DSB instruction has completed. Cc: stable@vger.kernel.org Acked-by: Will Deacon , Signed-off-by: Shanker Donthineni Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v3.c b/drivers/irqchip/irq-gic-v3.c index d71be9a1f9d2..d99cc07903ec 100644 --- a/drivers/irqchip/irq-gic-v3.c +++ b/drivers/irqchip/irq-gic-v3.c @@ -688,7 +688,7 @@ static void gic_raise_softirq(const struct cpumask *mask, unsigned int irq) * Ensure that stores to Normal memory are visible to the * other CPUs before issuing the IPI. */ - smp_wmb(); + wmb(); for_each_cpu(cpu, mask) { u64 cluster_id = MPIDR_TO_SGI_CLUSTER_ID(cpu_logical_map(cpu)); From 95a2562590c2f64a0398183f978d5cf3db6d0284 Mon Sep 17 00:00:00 2001 From: Stephen Boyd Date: Thu, 1 Feb 2018 09:03:29 -0800 Subject: [PATCH 137/149] irqchip/gic-v3: Ignore disabled ITS nodes On some platforms there's an ITS available but it's not enabled because reading or writing the registers is denied by the firmware. In fact, reading or writing them will cause the system to reset. We could remove the node from DT in such a case, but it's better to skip nodes that are marked as "disabled" in DT so that we can describe the hardware that exists and use the status property to indicate how the firmware has configured things. Cc: Stuart Yoder Cc: Laurentiu Tudor Cc: Greg Kroah-Hartman Cc: Marc Zyngier Cc: Rajendra Nayak Signed-off-by: Stephen Boyd Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v3-its-pci-msi.c | 2 ++ drivers/irqchip/irq-gic-v3-its-platform-msi.c | 2 ++ drivers/irqchip/irq-gic-v3-its.c | 2 ++ drivers/staging/fsl-mc/bus/irq-gic-v3-its-fsl-mc-msi.c | 2 ++ 4 files changed, 8 insertions(+) diff --git a/drivers/irqchip/irq-gic-v3-its-pci-msi.c b/drivers/irqchip/irq-gic-v3-its-pci-msi.c index 14a8c0a7e095..25a98de5cfb2 100644 --- a/drivers/irqchip/irq-gic-v3-its-pci-msi.c +++ b/drivers/irqchip/irq-gic-v3-its-pci-msi.c @@ -132,6 +132,8 @@ static int __init its_pci_of_msi_init(void) for (np = of_find_matching_node(NULL, its_device_id); np; np = of_find_matching_node(np, its_device_id)) { + if (!of_device_is_available(np)) + continue; if (!of_property_read_bool(np, "msi-controller")) continue; diff --git a/drivers/irqchip/irq-gic-v3-its-platform-msi.c b/drivers/irqchip/irq-gic-v3-its-platform-msi.c index 833a90fe33ae..8881a053c173 100644 --- a/drivers/irqchip/irq-gic-v3-its-platform-msi.c +++ b/drivers/irqchip/irq-gic-v3-its-platform-msi.c @@ -154,6 +154,8 @@ static void __init its_pmsi_of_init(void) for (np = of_find_matching_node(NULL, its_device_id); np; np = of_find_matching_node(np, its_device_id)) { + if (!of_device_is_available(np)) + continue; if (!of_property_read_bool(np, "msi-controller")) continue; diff --git a/drivers/irqchip/irq-gic-v3-its.c b/drivers/irqchip/irq-gic-v3-its.c index 06f025fd5726..1d3056f53747 100644 --- a/drivers/irqchip/irq-gic-v3-its.c +++ b/drivers/irqchip/irq-gic-v3-its.c @@ -3314,6 +3314,8 @@ static int __init its_of_probe(struct device_node *node) for (np = of_find_matching_node(node, its_device_id); np; np = of_find_matching_node(np, its_device_id)) { + if (!of_device_is_available(np)) + continue; if (!of_property_read_bool(np, "msi-controller")) { pr_warn("%pOF: no msi-controller property, ITS ignored\n", np); diff --git a/drivers/staging/fsl-mc/bus/irq-gic-v3-its-fsl-mc-msi.c b/drivers/staging/fsl-mc/bus/irq-gic-v3-its-fsl-mc-msi.c index 5064d5ddf581..fc2013aade51 100644 --- a/drivers/staging/fsl-mc/bus/irq-gic-v3-its-fsl-mc-msi.c +++ b/drivers/staging/fsl-mc/bus/irq-gic-v3-its-fsl-mc-msi.c @@ -73,6 +73,8 @@ static int __init its_fsl_mc_msi_init(void) for (np = of_find_matching_node(NULL, its_device_id); np; np = of_find_matching_node(np, its_device_id)) { + if (!of_device_is_available(np)) + continue; if (!of_property_read_bool(np, "msi-controller")) continue; From de337ee301422756dff43d6c60fbb0400c1235e9 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Feb 2018 18:55:33 +0000 Subject: [PATCH 138/149] irqchip/gic-v2m: Add PCI Multi-MSI support We'd never implemented Multi-MSI support with GICv2m, because it is weird and clunky, and you'd think people would rather use MSI-X. Turns out there is still plenty of devices out there that rely on Multi-MSI. Oh well, let's teach that trick to the v2m widget, it is not a big deal anyway. Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-gic-v2m.c | 46 +++++++++++++++++------------------ 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index 993a8426a453..1ff38aff9f29 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -94,7 +94,7 @@ static struct irq_chip gicv2m_msi_irq_chip = { static struct msi_domain_info gicv2m_msi_domain_info = { .flags = (MSI_FLAG_USE_DEF_DOM_OPS | MSI_FLAG_USE_DEF_CHIP_OPS | - MSI_FLAG_PCI_MSIX), + MSI_FLAG_PCI_MSIX | MSI_FLAG_MULTI_PCI_MSI), .chip = &gicv2m_msi_irq_chip, }; @@ -155,18 +155,12 @@ static int gicv2m_irq_gic_domain_alloc(struct irq_domain *domain, return 0; } -static void gicv2m_unalloc_msi(struct v2m_data *v2m, unsigned int hwirq) +static void gicv2m_unalloc_msi(struct v2m_data *v2m, unsigned int hwirq, + int nr_irqs) { - int pos; - - pos = hwirq - v2m->spi_start; - if (pos < 0 || pos >= v2m->nr_spis) { - pr_err("Failed to teardown msi. Invalid hwirq %d\n", hwirq); - return; - } - spin_lock(&v2m_lock); - __clear_bit(pos, v2m->bm); + bitmap_release_region(v2m->bm, hwirq - v2m->spi_start, + get_count_order(nr_irqs)); spin_unlock(&v2m_lock); } @@ -174,13 +168,13 @@ static int gicv2m_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, unsigned int nr_irqs, void *args) { struct v2m_data *v2m = NULL, *tmp; - int hwirq, offset, err = 0; + int hwirq, offset, i, err = 0; spin_lock(&v2m_lock); list_for_each_entry(tmp, &v2m_nodes, entry) { - offset = find_first_zero_bit(tmp->bm, tmp->nr_spis); - if (offset < tmp->nr_spis) { - __set_bit(offset, tmp->bm); + offset = bitmap_find_free_region(tmp->bm, tmp->nr_spis, + get_count_order(nr_irqs)); + if (offset >= 0) { v2m = tmp; break; } @@ -192,16 +186,21 @@ static int gicv2m_irq_domain_alloc(struct irq_domain *domain, unsigned int virq, hwirq = v2m->spi_start + offset; - err = gicv2m_irq_gic_domain_alloc(domain, virq, hwirq); - if (err) { - gicv2m_unalloc_msi(v2m, hwirq); - return err; + for (i = 0; i < nr_irqs; i++) { + err = gicv2m_irq_gic_domain_alloc(domain, virq + i, hwirq + i); + if (err) + goto fail; + + irq_domain_set_hwirq_and_chip(domain, virq + i, hwirq + i, + &gicv2m_irq_chip, v2m); } - irq_domain_set_hwirq_and_chip(domain, virq, hwirq, - &gicv2m_irq_chip, v2m); - return 0; + +fail: + irq_domain_free_irqs_parent(domain, virq, nr_irqs); + gicv2m_unalloc_msi(v2m, hwirq, get_count_order(nr_irqs)); + return err; } static void gicv2m_irq_domain_free(struct irq_domain *domain, @@ -210,8 +209,7 @@ static void gicv2m_irq_domain_free(struct irq_domain *domain, struct irq_data *d = irq_domain_get_irq_data(domain, virq); struct v2m_data *v2m = irq_data_get_irq_chip_data(d); - BUG_ON(nr_irqs != 1); - gicv2m_unalloc_msi(v2m, d->hwirq); + gicv2m_unalloc_msi(v2m, d->hwirq, nr_irqs); irq_domain_free_irqs_parent(domain, virq, nr_irqs); } From 2d02424e89eca71b3fa5e832e6fbe467a413e3d5 Mon Sep 17 00:00:00 2001 From: Jaedon Shin Date: Mon, 12 Feb 2018 11:18:12 +0900 Subject: [PATCH 139/149] irqchip/bcm: Remove hashed address printing Since commit ad67b74d2469 ("printk: hash addresses printed with %p") pointers are being hashed when printed. Displaying the virtual memory at bootup time is not helpful. so delete the prints. Acked-by: Florian Fainelli Signed-off-by: Jaedon Shin Signed-off-by: Marc Zyngier --- drivers/irqchip/irq-bcm7038-l1.c | 3 --- drivers/irqchip/irq-bcm7120-l2.c | 3 --- drivers/irqchip/irq-brcmstb-l2.c | 3 --- 3 files changed, 9 deletions(-) diff --git a/drivers/irqchip/irq-bcm7038-l1.c b/drivers/irqchip/irq-bcm7038-l1.c index 55cfb986225b..faf734ff4cf3 100644 --- a/drivers/irqchip/irq-bcm7038-l1.c +++ b/drivers/irqchip/irq-bcm7038-l1.c @@ -339,9 +339,6 @@ int __init bcm7038_l1_of_init(struct device_node *dn, goto out_unmap; } - pr_info("registered BCM7038 L1 intc (mem: 0x%p, IRQs: %d)\n", - intc->cpus[0]->map_base, IRQS_PER_WORD * intc->n_words); - return 0; out_unmap: diff --git a/drivers/irqchip/irq-bcm7120-l2.c b/drivers/irqchip/irq-bcm7120-l2.c index 983640eba418..8968e5e93fcb 100644 --- a/drivers/irqchip/irq-bcm7120-l2.c +++ b/drivers/irqchip/irq-bcm7120-l2.c @@ -318,9 +318,6 @@ static int __init bcm7120_l2_intc_probe(struct device_node *dn, } } - pr_info("registered %s intc (mem: 0x%p, parent IRQ(s): %d)\n", - intc_name, data->map_base[0], data->num_parent_irqs); - return 0; out_free_domain: diff --git a/drivers/irqchip/irq-brcmstb-l2.c b/drivers/irqchip/irq-brcmstb-l2.c index 691d20eb0bec..0e65f609352e 100644 --- a/drivers/irqchip/irq-brcmstb-l2.c +++ b/drivers/irqchip/irq-brcmstb-l2.c @@ -262,9 +262,6 @@ static int __init brcmstb_l2_intc_of_init(struct device_node *np, ct->chip.irq_set_wake = irq_gc_set_wake; } - pr_info("registered L2 intc (mem: 0x%p, parent irq: %d)\n", - base, parent_irq); - return 0; out_free_domain: From 0b24a0bbe2147815d982d9335c41bb10c04f40bc Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 14 Feb 2018 17:47:35 +0200 Subject: [PATCH 140/149] irqdomain: Re-use DEFINE_SHOW_ATTRIBUTE() macro ...instead of open coding file operations followed by custom ->open() callbacks per each attribute. Signed-off-by: Andy Shevchenko Signed-off-by: Marc Zyngier --- kernel/irq/irqdomain.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index e6a9c36470ee..82b8b18ee1eb 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1726,25 +1726,14 @@ static int irq_domain_debug_show(struct seq_file *m, void *p) irq_domain_debug_show_one(m, d, 0); return 0; } - -static int irq_domain_debug_open(struct inode *inode, struct file *file) -{ - return single_open(file, irq_domain_debug_show, inode->i_private); -} - -static const struct file_operations dfs_domain_ops = { - .open = irq_domain_debug_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; +DEFINE_SHOW_ATTRIBUTE(irq_domain_debug); static void debugfs_add_domain_dir(struct irq_domain *d) { if (!d->name || !domain_dir || d->debugfs_file) return; d->debugfs_file = debugfs_create_file(d->name, 0444, domain_dir, d, - &dfs_domain_ops); + &irq_domain_debug_fops); } static void debugfs_remove_domain_dir(struct irq_domain *d) @@ -1760,7 +1749,8 @@ void __init irq_domain_debugfs_init(struct dentry *root) if (!domain_dir) return; - debugfs_create_file("default", 0444, domain_dir, NULL, &dfs_domain_ops); + debugfs_create_file("default", 0444, domain_dir, NULL, + &irq_domain_debug_fops); mutex_lock(&irq_domain_mutex); list_for_each_entry(d, &irq_domain_list, link) debugfs_add_domain_dir(d); From 8dd601fa8317243be887458c49f6c29c2f3d719f Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 15 Feb 2018 20:00:15 +1100 Subject: [PATCH 141/149] dm: correctly handle chained bios in dec_pending() dec_pending() is given an error status (possibly 0) to be recorded against a bio. It can be called several times on the one 'struct dm_io', and it is careful to only assign a non-zero error to io->status. However when it then assigned io->status to bio->bi_status, it is not careful and could overwrite a genuine error status with 0. This can happen when chained bios are in use. If a bio is chained beneath the bio that this dm_io is handling, the child bio might complete and set bio->bi_status before the dm_io completes. This has been possible since chained bios were introduced in 3.14, and has become a lot easier to trigger with commit 18a25da84354 ("dm: ensure bio submission follows a depth-first tree walk") as that commit caused dm to start using chained bios itself. A particular failure mode is that if a bio spans an 'error' target and a working target, the 'error' fragment will complete instantly and set the ->bi_status, and the other fragment will normally complete a little later, and will clear ->bi_status. The fix is simply to only assign io_error to bio->bi_status when io_error is not zero. Reported-and-tested-by: Milan Broz Cc: stable@vger.kernel.org (v3.14+) Signed-off-by: NeilBrown Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index d6de00f367ef..68136806d365 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -903,7 +903,8 @@ static void dec_pending(struct dm_io *io, blk_status_t error) queue_io(md, bio); } else { /* done with normal IO or empty flush */ - bio->bi_status = io_error; + if (io_error) + bio->bi_status = io_error; bio_endio(bio); } } From af27d9403f5b80685b79c88425086edccecaf711 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 16 Feb 2018 16:25:53 +0100 Subject: [PATCH 142/149] mm: hide a #warning for COMPILE_TEST We get a warning about some slow configurations in randconfig kernels: mm/memory.c:83:2: error: #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. [-Werror=cpp] The warning is reasonable by itself, but gets in the way of randconfig build testing, so I'm hiding it whenever CONFIG_COMPILE_TEST is set. The warning was added in 2013 in commit 75980e97dacc ("mm: fold page->_last_nid into page->flags where possible"). Cc: stable@vger.kernel.org Signed-off-by: Arnd Bergmann Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index dd8de96f5547..5fcfc24904d1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -80,7 +80,7 @@ #include "internal.h" -#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS +#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST) #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif From 20a004e7b017cce282a46ac5d02c2b9c6b9bb1fa Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 15 Feb 2018 11:14:56 +0000 Subject: [PATCH 143/149] arm64: mm: Use READ_ONCE/WRITE_ONCE when accessing page tables In many cases, page tables can be accessed concurrently by either another CPU (due to things like fast gup) or by the hardware page table walker itself, which may set access/dirty bits. In such cases, it is important to use READ_ONCE/WRITE_ONCE when accessing page table entries so that entries cannot be torn, merged or subject to apparent loss of coherence due to compiler transformations. Whilst there are some scenarios where this cannot happen (e.g. pinned kernel mappings for the linear region), the overhead of using READ_ONCE /WRITE_ONCE everywhere is minimal and makes the code an awful lot easier to reason about. This patch consistently uses these macros in the arch code, as well as explicitly namespacing pointers to page table entries from the entries themselves by using adopting a 'p' suffix for the former (as is sometimes used elsewhere in the kernel source). Tested-by: Yury Norov Tested-by: Richard Ruigrok Reviewed-by: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/hugetlb.h | 2 +- arch/arm64/include/asm/kvm_mmu.h | 26 +-- arch/arm64/include/asm/mmu_context.h | 4 +- arch/arm64/include/asm/pgalloc.h | 44 ++--- arch/arm64/include/asm/pgtable.h | 23 ++- arch/arm64/kernel/efi.c | 2 +- arch/arm64/kernel/hibernate.c | 148 +++++++------- arch/arm64/mm/dump.c | 54 ++--- arch/arm64/mm/fault.c | 44 +++-- arch/arm64/mm/hugetlbpage.c | 92 ++++----- arch/arm64/mm/kasan_init.c | 70 +++---- arch/arm64/mm/mmu.c | 282 ++++++++++++++------------- arch/arm64/mm/pageattr.c | 32 +-- 13 files changed, 425 insertions(+), 398 deletions(-) diff --git a/arch/arm64/include/asm/hugetlb.h b/arch/arm64/include/asm/hugetlb.h index 1dca41bea16a..e73f68569624 100644 --- a/arch/arm64/include/asm/hugetlb.h +++ b/arch/arm64/include/asm/hugetlb.h @@ -22,7 +22,7 @@ static inline pte_t huge_ptep_get(pte_t *ptep) { - return *ptep; + return READ_ONCE(*ptep); } diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 9679067a1574..7faed6e48b46 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -185,42 +185,42 @@ static inline pmd_t kvm_s2pmd_mkexec(pmd_t pmd) return pmd; } -static inline void kvm_set_s2pte_readonly(pte_t *pte) +static inline void kvm_set_s2pte_readonly(pte_t *ptep) { pteval_t old_pteval, pteval; - pteval = READ_ONCE(pte_val(*pte)); + pteval = READ_ONCE(pte_val(*ptep)); do { old_pteval = pteval; pteval &= ~PTE_S2_RDWR; pteval |= PTE_S2_RDONLY; - pteval = cmpxchg_relaxed(&pte_val(*pte), old_pteval, pteval); + pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval); } while (pteval != old_pteval); } -static inline bool kvm_s2pte_readonly(pte_t *pte) +static inline bool kvm_s2pte_readonly(pte_t *ptep) { - return (pte_val(*pte) & PTE_S2_RDWR) == PTE_S2_RDONLY; + return (READ_ONCE(pte_val(*ptep)) & PTE_S2_RDWR) == PTE_S2_RDONLY; } -static inline bool kvm_s2pte_exec(pte_t *pte) +static inline bool kvm_s2pte_exec(pte_t *ptep) { - return !(pte_val(*pte) & PTE_S2_XN); + return !(READ_ONCE(pte_val(*ptep)) & PTE_S2_XN); } -static inline void kvm_set_s2pmd_readonly(pmd_t *pmd) +static inline void kvm_set_s2pmd_readonly(pmd_t *pmdp) { - kvm_set_s2pte_readonly((pte_t *)pmd); + kvm_set_s2pte_readonly((pte_t *)pmdp); } -static inline bool kvm_s2pmd_readonly(pmd_t *pmd) +static inline bool kvm_s2pmd_readonly(pmd_t *pmdp) { - return kvm_s2pte_readonly((pte_t *)pmd); + return kvm_s2pte_readonly((pte_t *)pmdp); } -static inline bool kvm_s2pmd_exec(pmd_t *pmd) +static inline bool kvm_s2pmd_exec(pmd_t *pmdp) { - return !(pmd_val(*pmd) & PMD_S2_XN); + return !(READ_ONCE(pmd_val(*pmdp)) & PMD_S2_XN); } static inline bool kvm_page_empty(void *ptr) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 8d3331985d2e..39ec0b8a689e 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -141,13 +141,13 @@ static inline void cpu_install_idmap(void) * Atomically replaces the active TTBR1_EL1 PGD with a new VA-compatible PGD, * avoiding the possibility of conflicting TLB entries being allocated. */ -static inline void cpu_replace_ttbr1(pgd_t *pgd) +static inline void cpu_replace_ttbr1(pgd_t *pgdp) { typedef void (ttbr_replace_func)(phys_addr_t); extern ttbr_replace_func idmap_cpu_replace_ttbr1; ttbr_replace_func *replace_phys; - phys_addr_t pgd_phys = virt_to_phys(pgd); + phys_addr_t pgd_phys = virt_to_phys(pgdp); replace_phys = (void *)__pa_symbol(idmap_cpu_replace_ttbr1); diff --git a/arch/arm64/include/asm/pgalloc.h b/arch/arm64/include/asm/pgalloc.h index e9d9f1b006ef..2e05bcd944c8 100644 --- a/arch/arm64/include/asm/pgalloc.h +++ b/arch/arm64/include/asm/pgalloc.h @@ -36,23 +36,23 @@ static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr) return (pmd_t *)__get_free_page(PGALLOC_GFP); } -static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd) +static inline void pmd_free(struct mm_struct *mm, pmd_t *pmdp) { - BUG_ON((unsigned long)pmd & (PAGE_SIZE-1)); - free_page((unsigned long)pmd); + BUG_ON((unsigned long)pmdp & (PAGE_SIZE-1)); + free_page((unsigned long)pmdp); } -static inline void __pud_populate(pud_t *pud, phys_addr_t pmd, pudval_t prot) +static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot) { - set_pud(pud, __pud(__phys_to_pud_val(pmd) | prot)); + set_pud(pudp, __pud(__phys_to_pud_val(pmdp) | prot)); } -static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) +static inline void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmdp) { - __pud_populate(pud, __pa(pmd), PMD_TYPE_TABLE); + __pud_populate(pudp, __pa(pmdp), PMD_TYPE_TABLE); } #else -static inline void __pud_populate(pud_t *pud, phys_addr_t pmd, pudval_t prot) +static inline void __pud_populate(pud_t *pudp, phys_addr_t pmdp, pudval_t prot) { BUILD_BUG(); } @@ -65,30 +65,30 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr) return (pud_t *)__get_free_page(PGALLOC_GFP); } -static inline void pud_free(struct mm_struct *mm, pud_t *pud) +static inline void pud_free(struct mm_struct *mm, pud_t *pudp) { - BUG_ON((unsigned long)pud & (PAGE_SIZE-1)); - free_page((unsigned long)pud); + BUG_ON((unsigned long)pudp & (PAGE_SIZE-1)); + free_page((unsigned long)pudp); } -static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pud, pgdval_t prot) +static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pudp, pgdval_t prot) { - set_pgd(pgdp, __pgd(__phys_to_pgd_val(pud) | prot)); + set_pgd(pgdp, __pgd(__phys_to_pgd_val(pudp) | prot)); } -static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud) +static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgdp, pud_t *pudp) { - __pgd_populate(pgd, __pa(pud), PUD_TYPE_TABLE); + __pgd_populate(pgdp, __pa(pudp), PUD_TYPE_TABLE); } #else -static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pud, pgdval_t prot) +static inline void __pgd_populate(pgd_t *pgdp, phys_addr_t pudp, pgdval_t prot) { BUILD_BUG(); } #endif /* CONFIG_PGTABLE_LEVELS > 3 */ extern pgd_t *pgd_alloc(struct mm_struct *mm); -extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); +extern void pgd_free(struct mm_struct *mm, pgd_t *pgdp); static inline pte_t * pte_alloc_one_kernel(struct mm_struct *mm, unsigned long addr) @@ -114,10 +114,10 @@ pte_alloc_one(struct mm_struct *mm, unsigned long addr) /* * Free a PTE table. */ -static inline void pte_free_kernel(struct mm_struct *mm, pte_t *pte) +static inline void pte_free_kernel(struct mm_struct *mm, pte_t *ptep) { - if (pte) - free_page((unsigned long)pte); + if (ptep) + free_page((unsigned long)ptep); } static inline void pte_free(struct mm_struct *mm, pgtable_t pte) @@ -126,10 +126,10 @@ static inline void pte_free(struct mm_struct *mm, pgtable_t pte) __free_page(pte); } -static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t pte, +static inline void __pmd_populate(pmd_t *pmdp, phys_addr_t ptep, pmdval_t prot) { - set_pmd(pmdp, __pmd(__phys_to_pmd_val(pte) | prot)); + set_pmd(pmdp, __pmd(__phys_to_pmd_val(ptep) | prot)); } /* diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 094374c82db0..7e2c27e63cd8 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -218,7 +218,7 @@ static inline pmd_t pmd_mkcont(pmd_t pmd) static inline void set_pte(pte_t *ptep, pte_t pte) { - *ptep = pte; + WRITE_ONCE(*ptep, pte); /* * Only if the new pte is valid and kernel, otherwise TLB maintenance @@ -250,6 +250,8 @@ extern void __sync_icache_dcache(pte_t pteval, unsigned long addr); static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { + pte_t old_pte; + if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte)) __sync_icache_dcache(pte, addr); @@ -258,14 +260,15 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, * hardware updates of the pte (ptep_set_access_flags safely changes * valid ptes without going through an invalid entry). */ - if (IS_ENABLED(CONFIG_DEBUG_VM) && pte_valid(*ptep) && pte_valid(pte) && + old_pte = READ_ONCE(*ptep); + if (IS_ENABLED(CONFIG_DEBUG_VM) && pte_valid(old_pte) && pte_valid(pte) && (mm == current->active_mm || atomic_read(&mm->mm_users) > 1)) { VM_WARN_ONCE(!pte_young(pte), "%s: racy access flag clearing: 0x%016llx -> 0x%016llx", - __func__, pte_val(*ptep), pte_val(pte)); - VM_WARN_ONCE(pte_write(*ptep) && !pte_dirty(pte), + __func__, pte_val(old_pte), pte_val(pte)); + VM_WARN_ONCE(pte_write(old_pte) && !pte_dirty(pte), "%s: racy dirty state clearing: 0x%016llx -> 0x%016llx", - __func__, pte_val(*ptep), pte_val(pte)); + __func__, pte_val(old_pte), pte_val(pte)); } set_pte(ptep, pte); @@ -431,7 +434,7 @@ extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { - *pmdp = pmd; + WRITE_ONCE(*pmdp, pmd); dsb(ishst); isb(); } @@ -482,7 +485,7 @@ static inline phys_addr_t pmd_page_paddr(pmd_t pmd) static inline void set_pud(pud_t *pudp, pud_t pud) { - *pudp = pud; + WRITE_ONCE(*pudp, pud); dsb(ishst); isb(); } @@ -500,7 +503,7 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) /* Find an entry in the second-level page table. */ #define pmd_index(addr) (((addr) >> PMD_SHIFT) & (PTRS_PER_PMD - 1)) -#define pmd_offset_phys(dir, addr) (pud_page_paddr(*(dir)) + pmd_index(addr) * sizeof(pmd_t)) +#define pmd_offset_phys(dir, addr) (pud_page_paddr(READ_ONCE(*(dir))) + pmd_index(addr) * sizeof(pmd_t)) #define pmd_offset(dir, addr) ((pmd_t *)__va(pmd_offset_phys((dir), (addr)))) #define pmd_set_fixmap(addr) ((pmd_t *)set_fixmap_offset(FIX_PMD, addr)) @@ -535,7 +538,7 @@ static inline phys_addr_t pud_page_paddr(pud_t pud) static inline void set_pgd(pgd_t *pgdp, pgd_t pgd) { - *pgdp = pgd; + WRITE_ONCE(*pgdp, pgd); dsb(ishst); } @@ -552,7 +555,7 @@ static inline phys_addr_t pgd_page_paddr(pgd_t pgd) /* Find an entry in the frst-level page table. */ #define pud_index(addr) (((addr) >> PUD_SHIFT) & (PTRS_PER_PUD - 1)) -#define pud_offset_phys(dir, addr) (pgd_page_paddr(*(dir)) + pud_index(addr) * sizeof(pud_t)) +#define pud_offset_phys(dir, addr) (pgd_page_paddr(READ_ONCE(*(dir))) + pud_index(addr) * sizeof(pud_t)) #define pud_offset(dir, addr) ((pud_t *)__va(pud_offset_phys((dir), (addr)))) #define pud_set_fixmap(addr) ((pud_t *)set_fixmap_offset(FIX_PUD, addr)) diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index f85ac58d08a3..a8bf1c892b90 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -90,7 +90,7 @@ static int __init set_permissions(pte_t *ptep, pgtable_t token, unsigned long addr, void *data) { efi_memory_desc_t *md = data; - pte_t pte = *ptep; + pte_t pte = READ_ONCE(*ptep); if (md->attribute & EFI_MEMORY_RO) pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c index f20cf7e99249..1ec5f28c39fc 100644 --- a/arch/arm64/kernel/hibernate.c +++ b/arch/arm64/kernel/hibernate.c @@ -202,10 +202,10 @@ static int create_safe_exec_page(void *src_start, size_t length, gfp_t mask) { int rc = 0; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep; unsigned long dst = (unsigned long)allocator(mask); if (!dst) { @@ -216,38 +216,38 @@ static int create_safe_exec_page(void *src_start, size_t length, memcpy((void *)dst, src_start, length); flush_icache_range(dst, dst + length); - pgd = pgd_offset_raw(allocator(mask), dst_addr); - if (pgd_none(*pgd)) { - pud = allocator(mask); - if (!pud) { + pgdp = pgd_offset_raw(allocator(mask), dst_addr); + if (pgd_none(READ_ONCE(*pgdp))) { + pudp = allocator(mask); + if (!pudp) { rc = -ENOMEM; goto out; } - pgd_populate(&init_mm, pgd, pud); + pgd_populate(&init_mm, pgdp, pudp); } - pud = pud_offset(pgd, dst_addr); - if (pud_none(*pud)) { - pmd = allocator(mask); - if (!pmd) { + pudp = pud_offset(pgdp, dst_addr); + if (pud_none(READ_ONCE(*pudp))) { + pmdp = allocator(mask); + if (!pmdp) { rc = -ENOMEM; goto out; } - pud_populate(&init_mm, pud, pmd); + pud_populate(&init_mm, pudp, pmdp); } - pmd = pmd_offset(pud, dst_addr); - if (pmd_none(*pmd)) { - pte = allocator(mask); - if (!pte) { + pmdp = pmd_offset(pudp, dst_addr); + if (pmd_none(READ_ONCE(*pmdp))) { + ptep = allocator(mask); + if (!ptep) { rc = -ENOMEM; goto out; } - pmd_populate_kernel(&init_mm, pmd, pte); + pmd_populate_kernel(&init_mm, pmdp, ptep); } - pte = pte_offset_kernel(pmd, dst_addr); - set_pte(pte, pfn_pte(virt_to_pfn(dst), PAGE_KERNEL_EXEC)); + ptep = pte_offset_kernel(pmdp, dst_addr); + set_pte(ptep, pfn_pte(virt_to_pfn(dst), PAGE_KERNEL_EXEC)); /* * Load our new page tables. A strict BBM approach requires that we @@ -263,7 +263,7 @@ static int create_safe_exec_page(void *src_start, size_t length, */ cpu_set_reserved_ttbr0(); local_flush_tlb_all(); - write_sysreg(phys_to_ttbr(virt_to_phys(pgd)), ttbr0_el1); + write_sysreg(phys_to_ttbr(virt_to_phys(pgdp)), ttbr0_el1); isb(); *phys_dst_addr = virt_to_phys((void *)dst); @@ -320,9 +320,9 @@ int swsusp_arch_suspend(void) return ret; } -static void _copy_pte(pte_t *dst_pte, pte_t *src_pte, unsigned long addr) +static void _copy_pte(pte_t *dst_ptep, pte_t *src_ptep, unsigned long addr) { - pte_t pte = *src_pte; + pte_t pte = READ_ONCE(*src_ptep); if (pte_valid(pte)) { /* @@ -330,7 +330,7 @@ static void _copy_pte(pte_t *dst_pte, pte_t *src_pte, unsigned long addr) * read only (code, rodata). Clear the RDONLY bit from * the temporary mappings we use during restore. */ - set_pte(dst_pte, pte_mkwrite(pte)); + set_pte(dst_ptep, pte_mkwrite(pte)); } else if (debug_pagealloc_enabled() && !pte_none(pte)) { /* * debug_pagealloc will removed the PTE_VALID bit if @@ -343,112 +343,116 @@ static void _copy_pte(pte_t *dst_pte, pte_t *src_pte, unsigned long addr) */ BUG_ON(!pfn_valid(pte_pfn(pte))); - set_pte(dst_pte, pte_mkpresent(pte_mkwrite(pte))); + set_pte(dst_ptep, pte_mkpresent(pte_mkwrite(pte))); } } -static int copy_pte(pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long start, +static int copy_pte(pmd_t *dst_pmdp, pmd_t *src_pmdp, unsigned long start, unsigned long end) { - pte_t *src_pte; - pte_t *dst_pte; + pte_t *src_ptep; + pte_t *dst_ptep; unsigned long addr = start; - dst_pte = (pte_t *)get_safe_page(GFP_ATOMIC); - if (!dst_pte) + dst_ptep = (pte_t *)get_safe_page(GFP_ATOMIC); + if (!dst_ptep) return -ENOMEM; - pmd_populate_kernel(&init_mm, dst_pmd, dst_pte); - dst_pte = pte_offset_kernel(dst_pmd, start); + pmd_populate_kernel(&init_mm, dst_pmdp, dst_ptep); + dst_ptep = pte_offset_kernel(dst_pmdp, start); - src_pte = pte_offset_kernel(src_pmd, start); + src_ptep = pte_offset_kernel(src_pmdp, start); do { - _copy_pte(dst_pte, src_pte, addr); - } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); + _copy_pte(dst_ptep, src_ptep, addr); + } while (dst_ptep++, src_ptep++, addr += PAGE_SIZE, addr != end); return 0; } -static int copy_pmd(pud_t *dst_pud, pud_t *src_pud, unsigned long start, +static int copy_pmd(pud_t *dst_pudp, pud_t *src_pudp, unsigned long start, unsigned long end) { - pmd_t *src_pmd; - pmd_t *dst_pmd; + pmd_t *src_pmdp; + pmd_t *dst_pmdp; unsigned long next; unsigned long addr = start; - if (pud_none(*dst_pud)) { - dst_pmd = (pmd_t *)get_safe_page(GFP_ATOMIC); - if (!dst_pmd) + if (pud_none(READ_ONCE(*dst_pudp))) { + dst_pmdp = (pmd_t *)get_safe_page(GFP_ATOMIC); + if (!dst_pmdp) return -ENOMEM; - pud_populate(&init_mm, dst_pud, dst_pmd); + pud_populate(&init_mm, dst_pudp, dst_pmdp); } - dst_pmd = pmd_offset(dst_pud, start); + dst_pmdp = pmd_offset(dst_pudp, start); - src_pmd = pmd_offset(src_pud, start); + src_pmdp = pmd_offset(src_pudp, start); do { + pmd_t pmd = READ_ONCE(*src_pmdp); + next = pmd_addr_end(addr, end); - if (pmd_none(*src_pmd)) + if (pmd_none(pmd)) continue; - if (pmd_table(*src_pmd)) { - if (copy_pte(dst_pmd, src_pmd, addr, next)) + if (pmd_table(pmd)) { + if (copy_pte(dst_pmdp, src_pmdp, addr, next)) return -ENOMEM; } else { - set_pmd(dst_pmd, - __pmd(pmd_val(*src_pmd) & ~PMD_SECT_RDONLY)); + set_pmd(dst_pmdp, + __pmd(pmd_val(pmd) & ~PMD_SECT_RDONLY)); } - } while (dst_pmd++, src_pmd++, addr = next, addr != end); + } while (dst_pmdp++, src_pmdp++, addr = next, addr != end); return 0; } -static int copy_pud(pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long start, +static int copy_pud(pgd_t *dst_pgdp, pgd_t *src_pgdp, unsigned long start, unsigned long end) { - pud_t *dst_pud; - pud_t *src_pud; + pud_t *dst_pudp; + pud_t *src_pudp; unsigned long next; unsigned long addr = start; - if (pgd_none(*dst_pgd)) { - dst_pud = (pud_t *)get_safe_page(GFP_ATOMIC); - if (!dst_pud) + if (pgd_none(READ_ONCE(*dst_pgdp))) { + dst_pudp = (pud_t *)get_safe_page(GFP_ATOMIC); + if (!dst_pudp) return -ENOMEM; - pgd_populate(&init_mm, dst_pgd, dst_pud); + pgd_populate(&init_mm, dst_pgdp, dst_pudp); } - dst_pud = pud_offset(dst_pgd, start); + dst_pudp = pud_offset(dst_pgdp, start); - src_pud = pud_offset(src_pgd, start); + src_pudp = pud_offset(src_pgdp, start); do { + pud_t pud = READ_ONCE(*src_pudp); + next = pud_addr_end(addr, end); - if (pud_none(*src_pud)) + if (pud_none(pud)) continue; - if (pud_table(*(src_pud))) { - if (copy_pmd(dst_pud, src_pud, addr, next)) + if (pud_table(pud)) { + if (copy_pmd(dst_pudp, src_pudp, addr, next)) return -ENOMEM; } else { - set_pud(dst_pud, - __pud(pud_val(*src_pud) & ~PMD_SECT_RDONLY)); + set_pud(dst_pudp, + __pud(pud_val(pud) & ~PMD_SECT_RDONLY)); } - } while (dst_pud++, src_pud++, addr = next, addr != end); + } while (dst_pudp++, src_pudp++, addr = next, addr != end); return 0; } -static int copy_page_tables(pgd_t *dst_pgd, unsigned long start, +static int copy_page_tables(pgd_t *dst_pgdp, unsigned long start, unsigned long end) { unsigned long next; unsigned long addr = start; - pgd_t *src_pgd = pgd_offset_k(start); + pgd_t *src_pgdp = pgd_offset_k(start); - dst_pgd = pgd_offset_raw(dst_pgd, start); + dst_pgdp = pgd_offset_raw(dst_pgdp, start); do { next = pgd_addr_end(addr, end); - if (pgd_none(*src_pgd)) + if (pgd_none(READ_ONCE(*src_pgdp))) continue; - if (copy_pud(dst_pgd, src_pgd, addr, next)) + if (copy_pud(dst_pgdp, src_pgdp, addr, next)) return -ENOMEM; - } while (dst_pgd++, src_pgd++, addr = next, addr != end); + } while (dst_pgdp++, src_pgdp++, addr = next, addr != end); return 0; } diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 7b60d62ac593..65dfc8571bf8 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -286,48 +286,52 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level, } -static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) +static void walk_pte(struct pg_state *st, pmd_t *pmdp, unsigned long start) { - pte_t *pte = pte_offset_kernel(pmd, 0UL); + pte_t *ptep = pte_offset_kernel(pmdp, 0UL); unsigned long addr; unsigned i; - for (i = 0; i < PTRS_PER_PTE; i++, pte++) { + for (i = 0; i < PTRS_PER_PTE; i++, ptep++) { addr = start + i * PAGE_SIZE; - note_page(st, addr, 4, pte_val(*pte)); + note_page(st, addr, 4, READ_ONCE(pte_val(*ptep))); } } -static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) +static void walk_pmd(struct pg_state *st, pud_t *pudp, unsigned long start) { - pmd_t *pmd = pmd_offset(pud, 0UL); + pmd_t *pmdp = pmd_offset(pudp, 0UL); unsigned long addr; unsigned i; - for (i = 0; i < PTRS_PER_PMD; i++, pmd++) { + for (i = 0; i < PTRS_PER_PMD; i++, pmdp++) { + pmd_t pmd = READ_ONCE(*pmdp); + addr = start + i * PMD_SIZE; - if (pmd_none(*pmd) || pmd_sect(*pmd)) { - note_page(st, addr, 3, pmd_val(*pmd)); + if (pmd_none(pmd) || pmd_sect(pmd)) { + note_page(st, addr, 3, pmd_val(pmd)); } else { - BUG_ON(pmd_bad(*pmd)); - walk_pte(st, pmd, addr); + BUG_ON(pmd_bad(pmd)); + walk_pte(st, pmdp, addr); } } } -static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) +static void walk_pud(struct pg_state *st, pgd_t *pgdp, unsigned long start) { - pud_t *pud = pud_offset(pgd, 0UL); + pud_t *pudp = pud_offset(pgdp, 0UL); unsigned long addr; unsigned i; - for (i = 0; i < PTRS_PER_PUD; i++, pud++) { + for (i = 0; i < PTRS_PER_PUD; i++, pudp++) { + pud_t pud = READ_ONCE(*pudp); + addr = start + i * PUD_SIZE; - if (pud_none(*pud) || pud_sect(*pud)) { - note_page(st, addr, 2, pud_val(*pud)); + if (pud_none(pud) || pud_sect(pud)) { + note_page(st, addr, 2, pud_val(pud)); } else { - BUG_ON(pud_bad(*pud)); - walk_pmd(st, pud, addr); + BUG_ON(pud_bad(pud)); + walk_pmd(st, pudp, addr); } } } @@ -335,17 +339,19 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) static void walk_pgd(struct pg_state *st, struct mm_struct *mm, unsigned long start) { - pgd_t *pgd = pgd_offset(mm, 0UL); + pgd_t *pgdp = pgd_offset(mm, 0UL); unsigned i; unsigned long addr; - for (i = 0; i < PTRS_PER_PGD; i++, pgd++) { + for (i = 0; i < PTRS_PER_PGD; i++, pgdp++) { + pgd_t pgd = READ_ONCE(*pgdp); + addr = start + i * PGDIR_SIZE; - if (pgd_none(*pgd)) { - note_page(st, addr, 1, pgd_val(*pgd)); + if (pgd_none(pgd)) { + note_page(st, addr, 1, pgd_val(pgd)); } else { - BUG_ON(pgd_bad(*pgd)); - walk_pud(st, pgd, addr); + BUG_ON(pgd_bad(pgd)); + walk_pud(st, pgdp, addr); } } } diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index f76bb2c3c943..bff11553eb05 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -130,7 +130,8 @@ static void mem_abort_decode(unsigned int esr) void show_pte(unsigned long addr) { struct mm_struct *mm; - pgd_t *pgd; + pgd_t *pgdp; + pgd_t pgd; if (addr < TASK_SIZE) { /* TTBR0 */ @@ -149,33 +150,37 @@ void show_pte(unsigned long addr) return; } - pr_alert("%s pgtable: %luk pages, %u-bit VAs, pgd = %p\n", + pr_alert("%s pgtable: %luk pages, %u-bit VAs, pgdp = %p\n", mm == &init_mm ? "swapper" : "user", PAGE_SIZE / SZ_1K, VA_BITS, mm->pgd); - pgd = pgd_offset(mm, addr); - pr_alert("[%016lx] *pgd=%016llx", addr, pgd_val(*pgd)); + pgdp = pgd_offset(mm, addr); + pgd = READ_ONCE(*pgdp); + pr_alert("[%016lx] pgd=%016llx", addr, pgd_val(pgd)); do { - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; - if (pgd_none(*pgd) || pgd_bad(*pgd)) + if (pgd_none(pgd) || pgd_bad(pgd)) break; - pud = pud_offset(pgd, addr); - pr_cont(", *pud=%016llx", pud_val(*pud)); - if (pud_none(*pud) || pud_bad(*pud)) + pudp = pud_offset(pgdp, addr); + pud = READ_ONCE(*pudp); + pr_cont(", pud=%016llx", pud_val(pud)); + if (pud_none(pud) || pud_bad(pud)) break; - pmd = pmd_offset(pud, addr); - pr_cont(", *pmd=%016llx", pmd_val(*pmd)); - if (pmd_none(*pmd) || pmd_bad(*pmd)) + pmdp = pmd_offset(pudp, addr); + pmd = READ_ONCE(*pmdp); + pr_cont(", pmd=%016llx", pmd_val(pmd)); + if (pmd_none(pmd) || pmd_bad(pmd)) break; - pte = pte_offset_map(pmd, addr); - pr_cont(", *pte=%016llx", pte_val(*pte)); - pte_unmap(pte); + ptep = pte_offset_map(pmdp, addr); + pte = READ_ONCE(*ptep); + pr_cont(", pte=%016llx", pte_val(pte)); + pte_unmap(ptep); } while(0); pr_cont("\n"); @@ -196,8 +201,9 @@ int ptep_set_access_flags(struct vm_area_struct *vma, pte_t entry, int dirty) { pteval_t old_pteval, pteval; + pte_t pte = READ_ONCE(*ptep); - if (pte_same(*ptep, entry)) + if (pte_same(pte, entry)) return 0; /* only preserve the access flags and write permission */ @@ -210,7 +216,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma, * (calculated as: a & b == ~(~a | ~b)). */ pte_val(entry) ^= PTE_RDONLY; - pteval = READ_ONCE(pte_val(*ptep)); + pteval = pte_val(pte); do { old_pteval = pteval; pteval ^= PTE_RDONLY; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 6cb0fa92a651..ecc6818191df 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -54,14 +54,14 @@ static inline pgprot_t pte_pgprot(pte_t pte) static int find_num_contig(struct mm_struct *mm, unsigned long addr, pte_t *ptep, size_t *pgsize) { - pgd_t *pgd = pgd_offset(mm, addr); - pud_t *pud; - pmd_t *pmd; + pgd_t *pgdp = pgd_offset(mm, addr); + pud_t *pudp; + pmd_t *pmdp; *pgsize = PAGE_SIZE; - pud = pud_offset(pgd, addr); - pmd = pmd_offset(pud, addr); - if ((pte_t *)pmd == ptep) { + pudp = pud_offset(pgdp, addr); + pmdp = pmd_offset(pudp, addr); + if ((pte_t *)pmdp == ptep) { *pgsize = PMD_SIZE; return CONT_PMDS; } @@ -181,11 +181,8 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, clear_flush(mm, addr, ptep, pgsize, ncontig); - for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) { - pr_debug("%s: set pte %p to 0x%llx\n", __func__, ptep, - pte_val(pfn_pte(pfn, hugeprot))); + for (i = 0; i < ncontig; i++, ptep++, addr += pgsize, pfn += dpfn) set_pte_at(mm, addr, ptep, pfn_pte(pfn, hugeprot)); - } } void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, @@ -203,20 +200,20 @@ void set_huge_swap_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) { - pgd_t *pgd; - pud_t *pud; - pte_t *pte = NULL; + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; + pte_t *ptep = NULL; - pr_debug("%s: addr:0x%lx sz:0x%lx\n", __func__, addr, sz); - pgd = pgd_offset(mm, addr); - pud = pud_alloc(mm, pgd, addr); - if (!pud) + pgdp = pgd_offset(mm, addr); + pudp = pud_alloc(mm, pgdp, addr); + if (!pudp) return NULL; if (sz == PUD_SIZE) { - pte = (pte_t *)pud; + ptep = (pte_t *)pudp; } else if (sz == (PAGE_SIZE * CONT_PTES)) { - pmd_t *pmd = pmd_alloc(mm, pud, addr); + pmdp = pmd_alloc(mm, pudp, addr); WARN_ON(addr & (sz - 1)); /* @@ -226,60 +223,55 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, * will be no pte_unmap() to correspond with this * pte_alloc_map(). */ - pte = pte_alloc_map(mm, pmd, addr); + ptep = pte_alloc_map(mm, pmdp, addr); } else if (sz == PMD_SIZE) { if (IS_ENABLED(CONFIG_ARCH_WANT_HUGE_PMD_SHARE) && - pud_none(*pud)) - pte = huge_pmd_share(mm, addr, pud); + pud_none(READ_ONCE(*pudp))) + ptep = huge_pmd_share(mm, addr, pudp); else - pte = (pte_t *)pmd_alloc(mm, pud, addr); + ptep = (pte_t *)pmd_alloc(mm, pudp, addr); } else if (sz == (PMD_SIZE * CONT_PMDS)) { - pmd_t *pmd; - - pmd = pmd_alloc(mm, pud, addr); + pmdp = pmd_alloc(mm, pudp, addr); WARN_ON(addr & (sz - 1)); - return (pte_t *)pmd; + return (pte_t *)pmdp; } - pr_debug("%s: addr:0x%lx sz:0x%lx ret pte=%p/0x%llx\n", __func__, addr, - sz, pte, pte_val(*pte)); - return pte; + return ptep; } pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; + pgd_t *pgdp; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; - pgd = pgd_offset(mm, addr); - pr_debug("%s: addr:0x%lx pgd:%p\n", __func__, addr, pgd); - if (!pgd_present(*pgd)) + pgdp = pgd_offset(mm, addr); + if (!pgd_present(READ_ONCE(*pgdp))) return NULL; - pud = pud_offset(pgd, addr); - if (sz != PUD_SIZE && pud_none(*pud)) + pudp = pud_offset(pgdp, addr); + pud = READ_ONCE(*pudp); + if (sz != PUD_SIZE && pud_none(pud)) return NULL; /* hugepage or swap? */ - if (pud_huge(*pud) || !pud_present(*pud)) - return (pte_t *)pud; + if (pud_huge(pud) || !pud_present(pud)) + return (pte_t *)pudp; /* table; check the next level */ if (sz == CONT_PMD_SIZE) addr &= CONT_PMD_MASK; - pmd = pmd_offset(pud, addr); + pmdp = pmd_offset(pudp, addr); + pmd = READ_ONCE(*pmdp); if (!(sz == PMD_SIZE || sz == CONT_PMD_SIZE) && - pmd_none(*pmd)) + pmd_none(pmd)) return NULL; - if (pmd_huge(*pmd) || !pmd_present(*pmd)) - return (pte_t *)pmd; + if (pmd_huge(pmd) || !pmd_present(pmd)) + return (pte_t *)pmdp; - if (sz == CONT_PTE_SIZE) { - pte_t *pte = pte_offset_kernel(pmd, (addr & CONT_PTE_MASK)); - return pte; - } + if (sz == CONT_PTE_SIZE) + return pte_offset_kernel(pmdp, (addr & CONT_PTE_MASK)); return NULL; } @@ -367,7 +359,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, size_t pgsize; pte_t pte; - if (!pte_cont(*ptep)) { + if (!pte_cont(READ_ONCE(*ptep))) { ptep_set_wrprotect(mm, addr, ptep); return; } @@ -391,7 +383,7 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma, size_t pgsize; int ncontig; - if (!pte_cont(*ptep)) { + if (!pte_cont(READ_ONCE(*ptep))) { ptep_clear_flush(vma, addr, ptep); return; } diff --git a/arch/arm64/mm/kasan_init.c b/arch/arm64/mm/kasan_init.c index 6e02e6fb4c7b..dabfc1ecda3d 100644 --- a/arch/arm64/mm/kasan_init.c +++ b/arch/arm64/mm/kasan_init.c @@ -44,92 +44,92 @@ static phys_addr_t __init kasan_alloc_zeroed_page(int node) return __pa(p); } -static pte_t *__init kasan_pte_offset(pmd_t *pmd, unsigned long addr, int node, +static pte_t *__init kasan_pte_offset(pmd_t *pmdp, unsigned long addr, int node, bool early) { - if (pmd_none(*pmd)) { + if (pmd_none(READ_ONCE(*pmdp))) { phys_addr_t pte_phys = early ? __pa_symbol(kasan_zero_pte) : kasan_alloc_zeroed_page(node); - __pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE); + __pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE); } - return early ? pte_offset_kimg(pmd, addr) - : pte_offset_kernel(pmd, addr); + return early ? pte_offset_kimg(pmdp, addr) + : pte_offset_kernel(pmdp, addr); } -static pmd_t *__init kasan_pmd_offset(pud_t *pud, unsigned long addr, int node, +static pmd_t *__init kasan_pmd_offset(pud_t *pudp, unsigned long addr, int node, bool early) { - if (pud_none(*pud)) { + if (pud_none(READ_ONCE(*pudp))) { phys_addr_t pmd_phys = early ? __pa_symbol(kasan_zero_pmd) : kasan_alloc_zeroed_page(node); - __pud_populate(pud, pmd_phys, PMD_TYPE_TABLE); + __pud_populate(pudp, pmd_phys, PMD_TYPE_TABLE); } - return early ? pmd_offset_kimg(pud, addr) : pmd_offset(pud, addr); + return early ? pmd_offset_kimg(pudp, addr) : pmd_offset(pudp, addr); } -static pud_t *__init kasan_pud_offset(pgd_t *pgd, unsigned long addr, int node, +static pud_t *__init kasan_pud_offset(pgd_t *pgdp, unsigned long addr, int node, bool early) { - if (pgd_none(*pgd)) { + if (pgd_none(READ_ONCE(*pgdp))) { phys_addr_t pud_phys = early ? __pa_symbol(kasan_zero_pud) : kasan_alloc_zeroed_page(node); - __pgd_populate(pgd, pud_phys, PMD_TYPE_TABLE); + __pgd_populate(pgdp, pud_phys, PMD_TYPE_TABLE); } - return early ? pud_offset_kimg(pgd, addr) : pud_offset(pgd, addr); + return early ? pud_offset_kimg(pgdp, addr) : pud_offset(pgdp, addr); } -static void __init kasan_pte_populate(pmd_t *pmd, unsigned long addr, +static void __init kasan_pte_populate(pmd_t *pmdp, unsigned long addr, unsigned long end, int node, bool early) { unsigned long next; - pte_t *pte = kasan_pte_offset(pmd, addr, node, early); + pte_t *ptep = kasan_pte_offset(pmdp, addr, node, early); do { phys_addr_t page_phys = early ? __pa_symbol(kasan_zero_page) : kasan_alloc_zeroed_page(node); next = addr + PAGE_SIZE; - set_pte(pte, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); - } while (pte++, addr = next, addr != end && pte_none(*pte)); + set_pte(ptep, pfn_pte(__phys_to_pfn(page_phys), PAGE_KERNEL)); + } while (ptep++, addr = next, addr != end && pte_none(READ_ONCE(*ptep))); } -static void __init kasan_pmd_populate(pud_t *pud, unsigned long addr, +static void __init kasan_pmd_populate(pud_t *pudp, unsigned long addr, unsigned long end, int node, bool early) { unsigned long next; - pmd_t *pmd = kasan_pmd_offset(pud, addr, node, early); + pmd_t *pmdp = kasan_pmd_offset(pudp, addr, node, early); do { next = pmd_addr_end(addr, end); - kasan_pte_populate(pmd, addr, next, node, early); - } while (pmd++, addr = next, addr != end && pmd_none(*pmd)); + kasan_pte_populate(pmdp, addr, next, node, early); + } while (pmdp++, addr = next, addr != end && pmd_none(READ_ONCE(*pmdp))); } -static void __init kasan_pud_populate(pgd_t *pgd, unsigned long addr, +static void __init kasan_pud_populate(pgd_t *pgdp, unsigned long addr, unsigned long end, int node, bool early) { unsigned long next; - pud_t *pud = kasan_pud_offset(pgd, addr, node, early); + pud_t *pudp = kasan_pud_offset(pgdp, addr, node, early); do { next = pud_addr_end(addr, end); - kasan_pmd_populate(pud, addr, next, node, early); - } while (pud++, addr = next, addr != end && pud_none(*pud)); + kasan_pmd_populate(pudp, addr, next, node, early); + } while (pudp++, addr = next, addr != end && pud_none(READ_ONCE(*pudp))); } static void __init kasan_pgd_populate(unsigned long addr, unsigned long end, int node, bool early) { unsigned long next; - pgd_t *pgd; + pgd_t *pgdp; - pgd = pgd_offset_k(addr); + pgdp = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - kasan_pud_populate(pgd, addr, next, node, early); - } while (pgd++, addr = next, addr != end); + kasan_pud_populate(pgdp, addr, next, node, early); + } while (pgdp++, addr = next, addr != end); } /* The early shadow maps everything to a single page of zeroes */ @@ -155,14 +155,14 @@ static void __init kasan_map_populate(unsigned long start, unsigned long end, */ void __init kasan_copy_shadow(pgd_t *pgdir) { - pgd_t *pgd, *pgd_new, *pgd_end; + pgd_t *pgdp, *pgdp_new, *pgdp_end; - pgd = pgd_offset_k(KASAN_SHADOW_START); - pgd_end = pgd_offset_k(KASAN_SHADOW_END); - pgd_new = pgd_offset_raw(pgdir, KASAN_SHADOW_START); + pgdp = pgd_offset_k(KASAN_SHADOW_START); + pgdp_end = pgd_offset_k(KASAN_SHADOW_END); + pgdp_new = pgd_offset_raw(pgdir, KASAN_SHADOW_START); do { - set_pgd(pgd_new, *pgd); - } while (pgd++, pgd_new++, pgd != pgd_end); + set_pgd(pgdp_new, READ_ONCE(*pgdp)); + } while (pgdp++, pgdp_new++, pgdp != pgdp_end); } static void __init clear_pgds(unsigned long start, diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 4694cda823c9..3161b853f29e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -125,45 +125,48 @@ static bool pgattr_change_is_safe(u64 old, u64 new) return ((old ^ new) & ~mask) == 0; } -static void init_pte(pmd_t *pmd, unsigned long addr, unsigned long end, +static void init_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot) { - pte_t *pte; + pte_t *ptep; - pte = pte_set_fixmap_offset(pmd, addr); + ptep = pte_set_fixmap_offset(pmdp, addr); do { - pte_t old_pte = *pte; + pte_t old_pte = READ_ONCE(*ptep); - set_pte(pte, pfn_pte(__phys_to_pfn(phys), prot)); + set_pte(ptep, pfn_pte(__phys_to_pfn(phys), prot)); /* * After the PTE entry has been populated once, we * only allow updates to the permission attributes. */ - BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), pte_val(*pte))); + BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), + READ_ONCE(pte_val(*ptep)))); phys += PAGE_SIZE; - } while (pte++, addr += PAGE_SIZE, addr != end); + } while (ptep++, addr += PAGE_SIZE, addr != end); pte_clear_fixmap(); } -static void alloc_init_cont_pte(pmd_t *pmd, unsigned long addr, +static void alloc_init_cont_pte(pmd_t *pmdp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void), int flags) { unsigned long next; + pmd_t pmd = READ_ONCE(*pmdp); - BUG_ON(pmd_sect(*pmd)); - if (pmd_none(*pmd)) { + BUG_ON(pmd_sect(pmd)); + if (pmd_none(pmd)) { phys_addr_t pte_phys; BUG_ON(!pgtable_alloc); pte_phys = pgtable_alloc(); - __pmd_populate(pmd, pte_phys, PMD_TYPE_TABLE); + __pmd_populate(pmdp, pte_phys, PMD_TYPE_TABLE); + pmd = READ_ONCE(*pmdp); } - BUG_ON(pmd_bad(*pmd)); + BUG_ON(pmd_bad(pmd)); do { pgprot_t __prot = prot; @@ -175,67 +178,69 @@ static void alloc_init_cont_pte(pmd_t *pmd, unsigned long addr, (flags & NO_CONT_MAPPINGS) == 0) __prot = __pgprot(pgprot_val(prot) | PTE_CONT); - init_pte(pmd, addr, next, phys, __prot); + init_pte(pmdp, addr, next, phys, __prot); phys += next - addr; } while (addr = next, addr != end); } -static void init_pmd(pud_t *pud, unsigned long addr, unsigned long end, +static void init_pmd(pud_t *pudp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void), int flags) { unsigned long next; - pmd_t *pmd; + pmd_t *pmdp; - pmd = pmd_set_fixmap_offset(pud, addr); + pmdp = pmd_set_fixmap_offset(pudp, addr); do { - pmd_t old_pmd = *pmd; + pmd_t old_pmd = READ_ONCE(*pmdp); next = pmd_addr_end(addr, end); /* try section mapping first */ if (((addr | next | phys) & ~SECTION_MASK) == 0 && (flags & NO_BLOCK_MAPPINGS) == 0) { - pmd_set_huge(pmd, phys, prot); + pmd_set_huge(pmdp, phys, prot); /* * After the PMD entry has been populated once, we * only allow updates to the permission attributes. */ BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd), - pmd_val(*pmd))); + READ_ONCE(pmd_val(*pmdp)))); } else { - alloc_init_cont_pte(pmd, addr, next, phys, prot, + alloc_init_cont_pte(pmdp, addr, next, phys, prot, pgtable_alloc, flags); BUG_ON(pmd_val(old_pmd) != 0 && - pmd_val(old_pmd) != pmd_val(*pmd)); + pmd_val(old_pmd) != READ_ONCE(pmd_val(*pmdp))); } phys += next - addr; - } while (pmd++, addr = next, addr != end); + } while (pmdp++, addr = next, addr != end); pmd_clear_fixmap(); } -static void alloc_init_cont_pmd(pud_t *pud, unsigned long addr, +static void alloc_init_cont_pmd(pud_t *pudp, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void), int flags) { unsigned long next; + pud_t pud = READ_ONCE(*pudp); /* * Check for initial section mappings in the pgd/pud. */ - BUG_ON(pud_sect(*pud)); - if (pud_none(*pud)) { + BUG_ON(pud_sect(pud)); + if (pud_none(pud)) { phys_addr_t pmd_phys; BUG_ON(!pgtable_alloc); pmd_phys = pgtable_alloc(); - __pud_populate(pud, pmd_phys, PUD_TYPE_TABLE); + __pud_populate(pudp, pmd_phys, PUD_TYPE_TABLE); + pud = READ_ONCE(*pudp); } - BUG_ON(pud_bad(*pud)); + BUG_ON(pud_bad(pud)); do { pgprot_t __prot = prot; @@ -247,7 +252,7 @@ static void alloc_init_cont_pmd(pud_t *pud, unsigned long addr, (flags & NO_CONT_MAPPINGS) == 0) __prot = __pgprot(pgprot_val(prot) | PTE_CONT); - init_pmd(pud, addr, next, phys, __prot, pgtable_alloc, flags); + init_pmd(pudp, addr, next, phys, __prot, pgtable_alloc, flags); phys += next - addr; } while (addr = next, addr != end); @@ -265,25 +270,27 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, return true; } -static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, - phys_addr_t phys, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void), - int flags) +static void alloc_init_pud(pgd_t *pgdp, unsigned long addr, unsigned long end, + phys_addr_t phys, pgprot_t prot, + phys_addr_t (*pgtable_alloc)(void), + int flags) { - pud_t *pud; unsigned long next; + pud_t *pudp; + pgd_t pgd = READ_ONCE(*pgdp); - if (pgd_none(*pgd)) { + if (pgd_none(pgd)) { phys_addr_t pud_phys; BUG_ON(!pgtable_alloc); pud_phys = pgtable_alloc(); - __pgd_populate(pgd, pud_phys, PUD_TYPE_TABLE); + __pgd_populate(pgdp, pud_phys, PUD_TYPE_TABLE); + pgd = READ_ONCE(*pgdp); } - BUG_ON(pgd_bad(*pgd)); + BUG_ON(pgd_bad(pgd)); - pud = pud_set_fixmap_offset(pgd, addr); + pudp = pud_set_fixmap_offset(pgdp, addr); do { - pud_t old_pud = *pud; + pud_t old_pud = READ_ONCE(*pudp); next = pud_addr_end(addr, end); @@ -292,23 +299,23 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, */ if (use_1G_block(addr, next, phys) && (flags & NO_BLOCK_MAPPINGS) == 0) { - pud_set_huge(pud, phys, prot); + pud_set_huge(pudp, phys, prot); /* * After the PUD entry has been populated once, we * only allow updates to the permission attributes. */ BUG_ON(!pgattr_change_is_safe(pud_val(old_pud), - pud_val(*pud))); + READ_ONCE(pud_val(*pudp)))); } else { - alloc_init_cont_pmd(pud, addr, next, phys, prot, + alloc_init_cont_pmd(pudp, addr, next, phys, prot, pgtable_alloc, flags); BUG_ON(pud_val(old_pud) != 0 && - pud_val(old_pud) != pud_val(*pud)); + pud_val(old_pud) != READ_ONCE(pud_val(*pudp))); } phys += next - addr; - } while (pud++, addr = next, addr != end); + } while (pudp++, addr = next, addr != end); pud_clear_fixmap(); } @@ -320,7 +327,7 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, int flags) { unsigned long addr, length, end, next; - pgd_t *pgd = pgd_offset_raw(pgdir, virt); + pgd_t *pgdp = pgd_offset_raw(pgdir, virt); /* * If the virtual and physical address don't have the same offset @@ -336,10 +343,10 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, end = addr + length; do { next = pgd_addr_end(addr, end); - alloc_init_pud(pgd, addr, next, phys, prot, pgtable_alloc, + alloc_init_pud(pgdp, addr, next, phys, prot, pgtable_alloc, flags); phys += next - addr; - } while (pgd++, addr = next, addr != end); + } while (pgdp++, addr = next, addr != end); } static phys_addr_t pgd_pgtable_alloc(void) @@ -401,10 +408,10 @@ static void update_mapping_prot(phys_addr_t phys, unsigned long virt, flush_tlb_kernel_range(virt, virt + size); } -static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, +static void __init __map_memblock(pgd_t *pgdp, phys_addr_t start, phys_addr_t end, pgprot_t prot, int flags) { - __create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start, + __create_pgd_mapping(pgdp, start, __phys_to_virt(start), end - start, prot, early_pgtable_alloc, flags); } @@ -418,7 +425,7 @@ void __init mark_linear_text_alias_ro(void) PAGE_KERNEL_RO); } -static void __init map_mem(pgd_t *pgd) +static void __init map_mem(pgd_t *pgdp) { phys_addr_t kernel_start = __pa_symbol(_text); phys_addr_t kernel_end = __pa_symbol(__init_begin); @@ -451,7 +458,7 @@ static void __init map_mem(pgd_t *pgd) if (memblock_is_nomap(reg)) continue; - __map_memblock(pgd, start, end, PAGE_KERNEL, flags); + __map_memblock(pgdp, start, end, PAGE_KERNEL, flags); } /* @@ -464,7 +471,7 @@ static void __init map_mem(pgd_t *pgd) * Note that contiguous mappings cannot be remapped in this way, * so we should avoid them here. */ - __map_memblock(pgd, kernel_start, kernel_end, + __map_memblock(pgdp, kernel_start, kernel_end, PAGE_KERNEL, NO_CONT_MAPPINGS); memblock_clear_nomap(kernel_start, kernel_end - kernel_start); @@ -475,7 +482,7 @@ static void __init map_mem(pgd_t *pgd) * through /sys/kernel/kexec_crash_size interface. */ if (crashk_res.end) { - __map_memblock(pgd, crashk_res.start, crashk_res.end + 1, + __map_memblock(pgdp, crashk_res.start, crashk_res.end + 1, PAGE_KERNEL, NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS); memblock_clear_nomap(crashk_res.start, @@ -499,7 +506,7 @@ void mark_rodata_ro(void) debug_checkwx(); } -static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, +static void __init map_kernel_segment(pgd_t *pgdp, void *va_start, void *va_end, pgprot_t prot, struct vm_struct *vma, int flags, unsigned long vm_flags) { @@ -509,7 +516,7 @@ static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, BUG_ON(!PAGE_ALIGNED(pa_start)); BUG_ON(!PAGE_ALIGNED(size)); - __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, + __create_pgd_mapping(pgdp, pa_start, (unsigned long)va_start, size, prot, early_pgtable_alloc, flags); if (!(vm_flags & VM_NO_GUARD)) @@ -562,7 +569,7 @@ core_initcall(map_entry_trampoline); /* * Create fine-grained mappings for the kernel. */ -static void __init map_kernel(pgd_t *pgd) +static void __init map_kernel(pgd_t *pgdp) { static struct vm_struct vmlinux_text, vmlinux_rodata, vmlinux_inittext, vmlinux_initdata, vmlinux_data; @@ -578,24 +585,24 @@ static void __init map_kernel(pgd_t *pgd) * Only rodata will be remapped with different permissions later on, * all other segments are allowed to use contiguous mappings. */ - map_kernel_segment(pgd, _text, _etext, text_prot, &vmlinux_text, 0, + map_kernel_segment(pgdp, _text, _etext, text_prot, &vmlinux_text, 0, VM_NO_GUARD); - map_kernel_segment(pgd, __start_rodata, __inittext_begin, PAGE_KERNEL, + map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL, &vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD); - map_kernel_segment(pgd, __inittext_begin, __inittext_end, text_prot, + map_kernel_segment(pgdp, __inittext_begin, __inittext_end, text_prot, &vmlinux_inittext, 0, VM_NO_GUARD); - map_kernel_segment(pgd, __initdata_begin, __initdata_end, PAGE_KERNEL, + map_kernel_segment(pgdp, __initdata_begin, __initdata_end, PAGE_KERNEL, &vmlinux_initdata, 0, VM_NO_GUARD); - map_kernel_segment(pgd, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0); + map_kernel_segment(pgdp, _data, _end, PAGE_KERNEL, &vmlinux_data, 0, 0); - if (!pgd_val(*pgd_offset_raw(pgd, FIXADDR_START))) { + if (!READ_ONCE(pgd_val(*pgd_offset_raw(pgdp, FIXADDR_START)))) { /* * The fixmap falls in a separate pgd to the kernel, and doesn't * live in the carveout for the swapper_pg_dir. We can simply * re-use the existing dir for the fixmap. */ - set_pgd(pgd_offset_raw(pgd, FIXADDR_START), - *pgd_offset_k(FIXADDR_START)); + set_pgd(pgd_offset_raw(pgdp, FIXADDR_START), + READ_ONCE(*pgd_offset_k(FIXADDR_START))); } else if (CONFIG_PGTABLE_LEVELS > 3) { /* * The fixmap shares its top level pgd entry with the kernel @@ -604,14 +611,15 @@ static void __init map_kernel(pgd_t *pgd) * entry instead. */ BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); - pud_populate(&init_mm, pud_set_fixmap_offset(pgd, FIXADDR_START), + pud_populate(&init_mm, + pud_set_fixmap_offset(pgdp, FIXADDR_START), lm_alias(bm_pmd)); pud_clear_fixmap(); } else { BUG(); } - kasan_copy_shadow(pgd); + kasan_copy_shadow(pgdp); } /* @@ -621,10 +629,10 @@ static void __init map_kernel(pgd_t *pgd) void __init paging_init(void) { phys_addr_t pgd_phys = early_pgtable_alloc(); - pgd_t *pgd = pgd_set_fixmap(pgd_phys); + pgd_t *pgdp = pgd_set_fixmap(pgd_phys); - map_kernel(pgd); - map_mem(pgd); + map_kernel(pgdp); + map_mem(pgdp); /* * We want to reuse the original swapper_pg_dir so we don't have to @@ -635,7 +643,7 @@ void __init paging_init(void) * To do this we need to go via a temporary pgd. */ cpu_replace_ttbr1(__va(pgd_phys)); - memcpy(swapper_pg_dir, pgd, PGD_SIZE); + memcpy(swapper_pg_dir, pgdp, PGD_SIZE); cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); pgd_clear_fixmap(); @@ -655,37 +663,40 @@ void __init paging_init(void) */ int kern_addr_valid(unsigned long addr) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + pgd_t *pgdp; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep, pte; if ((((long)addr) >> VA_BITS) != -1UL) return 0; - pgd = pgd_offset_k(addr); - if (pgd_none(*pgd)) + pgdp = pgd_offset_k(addr); + if (pgd_none(READ_ONCE(*pgdp))) return 0; - pud = pud_offset(pgd, addr); - if (pud_none(*pud)) + pudp = pud_offset(pgdp, addr); + pud = READ_ONCE(*pudp); + if (pud_none(pud)) return 0; - if (pud_sect(*pud)) - return pfn_valid(pud_pfn(*pud)); + if (pud_sect(pud)) + return pfn_valid(pud_pfn(pud)); - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) + pmdp = pmd_offset(pudp, addr); + pmd = READ_ONCE(*pmdp); + if (pmd_none(pmd)) return 0; - if (pmd_sect(*pmd)) - return pfn_valid(pmd_pfn(*pmd)); + if (pmd_sect(pmd)) + return pfn_valid(pmd_pfn(pmd)); - pte = pte_offset_kernel(pmd, addr); - if (pte_none(*pte)) + ptep = pte_offset_kernel(pmdp, addr); + pte = READ_ONCE(*ptep); + if (pte_none(pte)) return 0; - return pfn_valid(pte_pfn(*pte)); + return pfn_valid(pte_pfn(pte)); } #ifdef CONFIG_SPARSEMEM_VMEMMAP #if !ARM64_SWAPPER_USES_SECTION_MAPS @@ -700,32 +711,32 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, { unsigned long addr = start; unsigned long next; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; + pgd_t *pgdp; + pud_t *pudp; + pmd_t *pmdp; do { next = pmd_addr_end(addr, end); - pgd = vmemmap_pgd_populate(addr, node); - if (!pgd) + pgdp = vmemmap_pgd_populate(addr, node); + if (!pgdp) return -ENOMEM; - pud = vmemmap_pud_populate(pgd, addr, node); - if (!pud) + pudp = vmemmap_pud_populate(pgdp, addr, node); + if (!pudp) return -ENOMEM; - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) { + pmdp = pmd_offset(pudp, addr); + if (pmd_none(READ_ONCE(*pmdp))) { void *p = NULL; p = vmemmap_alloc_block_buf(PMD_SIZE, node); if (!p) return -ENOMEM; - pmd_set_huge(pmd, __pa(p), __pgprot(PROT_SECT_NORMAL)); + pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL)); } else - vmemmap_verify((pte_t *)pmd, node, addr, next); + vmemmap_verify((pte_t *)pmdp, node, addr, next); } while (addr = next, addr != end); return 0; @@ -739,20 +750,22 @@ void vmemmap_free(unsigned long start, unsigned long end, static inline pud_t * fixmap_pud(unsigned long addr) { - pgd_t *pgd = pgd_offset_k(addr); + pgd_t *pgdp = pgd_offset_k(addr); + pgd_t pgd = READ_ONCE(*pgdp); - BUG_ON(pgd_none(*pgd) || pgd_bad(*pgd)); + BUG_ON(pgd_none(pgd) || pgd_bad(pgd)); - return pud_offset_kimg(pgd, addr); + return pud_offset_kimg(pgdp, addr); } static inline pmd_t * fixmap_pmd(unsigned long addr) { - pud_t *pud = fixmap_pud(addr); + pud_t *pudp = fixmap_pud(addr); + pud_t pud = READ_ONCE(*pudp); - BUG_ON(pud_none(*pud) || pud_bad(*pud)); + BUG_ON(pud_none(pud) || pud_bad(pud)); - return pmd_offset_kimg(pud, addr); + return pmd_offset_kimg(pudp, addr); } static inline pte_t * fixmap_pte(unsigned long addr) @@ -768,30 +781,31 @@ static inline pte_t * fixmap_pte(unsigned long addr) */ void __init early_fixmap_init(void) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; + pgd_t *pgdp, pgd; + pud_t *pudp; + pmd_t *pmdp; unsigned long addr = FIXADDR_START; - pgd = pgd_offset_k(addr); + pgdp = pgd_offset_k(addr); + pgd = READ_ONCE(*pgdp); if (CONFIG_PGTABLE_LEVELS > 3 && - !(pgd_none(*pgd) || pgd_page_paddr(*pgd) == __pa_symbol(bm_pud))) { + !(pgd_none(pgd) || pgd_page_paddr(pgd) == __pa_symbol(bm_pud))) { /* * We only end up here if the kernel mapping and the fixmap * share the top level pgd entry, which should only happen on * 16k/4 levels configurations. */ BUG_ON(!IS_ENABLED(CONFIG_ARM64_16K_PAGES)); - pud = pud_offset_kimg(pgd, addr); + pudp = pud_offset_kimg(pgdp, addr); } else { - if (pgd_none(*pgd)) - __pgd_populate(pgd, __pa_symbol(bm_pud), PUD_TYPE_TABLE); - pud = fixmap_pud(addr); + if (pgd_none(pgd)) + __pgd_populate(pgdp, __pa_symbol(bm_pud), PUD_TYPE_TABLE); + pudp = fixmap_pud(addr); } - if (pud_none(*pud)) - __pud_populate(pud, __pa_symbol(bm_pmd), PMD_TYPE_TABLE); - pmd = fixmap_pmd(addr); - __pmd_populate(pmd, __pa_symbol(bm_pte), PMD_TYPE_TABLE); + if (pud_none(READ_ONCE(*pudp))) + __pud_populate(pudp, __pa_symbol(bm_pmd), PMD_TYPE_TABLE); + pmdp = fixmap_pmd(addr); + __pmd_populate(pmdp, __pa_symbol(bm_pte), PMD_TYPE_TABLE); /* * The boot-ioremap range spans multiple pmds, for which @@ -800,11 +814,11 @@ void __init early_fixmap_init(void) BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT) != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT)); - if ((pmd != fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN))) - || pmd != fixmap_pmd(fix_to_virt(FIX_BTMAP_END))) { + if ((pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN))) + || pmdp != fixmap_pmd(fix_to_virt(FIX_BTMAP_END))) { WARN_ON(1); - pr_warn("pmd %p != %p, %p\n", - pmd, fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)), + pr_warn("pmdp %p != %p, %p\n", + pmdp, fixmap_pmd(fix_to_virt(FIX_BTMAP_BEGIN)), fixmap_pmd(fix_to_virt(FIX_BTMAP_END))); pr_warn("fix_to_virt(FIX_BTMAP_BEGIN): %08lx\n", fix_to_virt(FIX_BTMAP_BEGIN)); @@ -824,16 +838,16 @@ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags) { unsigned long addr = __fix_to_virt(idx); - pte_t *pte; + pte_t *ptep; BUG_ON(idx <= FIX_HOLE || idx >= __end_of_fixed_addresses); - pte = fixmap_pte(addr); + ptep = fixmap_pte(addr); if (pgprot_val(flags)) { - set_pte(pte, pfn_pte(phys >> PAGE_SHIFT, flags)); + set_pte(ptep, pfn_pte(phys >> PAGE_SHIFT, flags)); } else { - pte_clear(&init_mm, addr, pte); + pte_clear(&init_mm, addr, ptep); flush_tlb_kernel_range(addr, addr+PAGE_SIZE); } } @@ -915,36 +929,36 @@ int __init arch_ioremap_pmd_supported(void) return 1; } -int pud_set_huge(pud_t *pud, phys_addr_t phys, pgprot_t prot) +int pud_set_huge(pud_t *pudp, phys_addr_t phys, pgprot_t prot) { pgprot_t sect_prot = __pgprot(PUD_TYPE_SECT | pgprot_val(mk_sect_prot(prot))); BUG_ON(phys & ~PUD_MASK); - set_pud(pud, pfn_pud(__phys_to_pfn(phys), sect_prot)); + set_pud(pudp, pfn_pud(__phys_to_pfn(phys), sect_prot)); return 1; } -int pmd_set_huge(pmd_t *pmd, phys_addr_t phys, pgprot_t prot) +int pmd_set_huge(pmd_t *pmdp, phys_addr_t phys, pgprot_t prot) { pgprot_t sect_prot = __pgprot(PMD_TYPE_SECT | pgprot_val(mk_sect_prot(prot))); BUG_ON(phys & ~PMD_MASK); - set_pmd(pmd, pfn_pmd(__phys_to_pfn(phys), sect_prot)); + set_pmd(pmdp, pfn_pmd(__phys_to_pfn(phys), sect_prot)); return 1; } -int pud_clear_huge(pud_t *pud) +int pud_clear_huge(pud_t *pudp) { - if (!pud_sect(*pud)) + if (!pud_sect(READ_ONCE(*pudp))) return 0; - pud_clear(pud); + pud_clear(pudp); return 1; } -int pmd_clear_huge(pmd_t *pmd) +int pmd_clear_huge(pmd_t *pmdp) { - if (!pmd_sect(*pmd)) + if (!pmd_sect(READ_ONCE(*pmdp))) return 0; - pmd_clear(pmd); + pmd_clear(pmdp); return 1; } diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index a682a0a2a0fa..a56359373d8b 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -29,7 +29,7 @@ static int change_page_range(pte_t *ptep, pgtable_t token, unsigned long addr, void *data) { struct page_change_data *cdata = data; - pte_t pte = *ptep; + pte_t pte = READ_ONCE(*ptep); pte = clear_pte_bit(pte, cdata->clear_mask); pte = set_pte_bit(pte, cdata->set_mask); @@ -156,30 +156,32 @@ void __kernel_map_pages(struct page *page, int numpages, int enable) */ bool kernel_page_present(struct page *page) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; + pgd_t *pgdp; + pud_t *pudp, pud; + pmd_t *pmdp, pmd; + pte_t *ptep; unsigned long addr = (unsigned long)page_address(page); - pgd = pgd_offset_k(addr); - if (pgd_none(*pgd)) + pgdp = pgd_offset_k(addr); + if (pgd_none(READ_ONCE(*pgdp))) return false; - pud = pud_offset(pgd, addr); - if (pud_none(*pud)) + pudp = pud_offset(pgdp, addr); + pud = READ_ONCE(*pudp); + if (pud_none(pud)) return false; - if (pud_sect(*pud)) + if (pud_sect(pud)) return true; - pmd = pmd_offset(pud, addr); - if (pmd_none(*pmd)) + pmdp = pmd_offset(pudp, addr); + pmd = READ_ONCE(*pmdp); + if (pmd_none(pmd)) return false; - if (pmd_sect(*pmd)) + if (pmd_sect(pmd)) return true; - pte = pte_offset_kernel(pmd, addr); - return pte_valid(*pte); + ptep = pte_offset_kernel(pmdp, addr); + return pte_valid(READ_ONCE(*ptep)); } #endif /* CONFIG_HIBERNATION */ #endif /* CONFIG_DEBUG_PAGEALLOC */ From e1a50de37860b3a93a9d643b09638db5aff47650 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Fri, 16 Feb 2018 17:04:23 +0000 Subject: [PATCH 144/149] arm64: cputype: Silence Sparse warnings Sparse makes a fair bit of noise about our MPIDR mask being implicitly long - let's explicitly describe it as such rather than just relying on the value forcing automatic promotion. Signed-off-by: Robin Murphy Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/cputype.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index be7bd19c87ec..eda8c5f629fc 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -20,7 +20,7 @@ #define MPIDR_UP_BITMASK (0x1 << 30) #define MPIDR_MT_BITMASK (0x1 << 24) -#define MPIDR_HWID_BITMASK 0xff00ffffff +#define MPIDR_HWID_BITMASK 0xff00ffffffUL #define MPIDR_LEVEL_BITS_SHIFT 3 #define MPIDR_LEVEL_BITS (1 << MPIDR_LEVEL_BITS_SHIFT) From 29fee6eed2811ff1089b30fc579a2d19d78016ab Mon Sep 17 00:00:00 2001 From: Joao Martins Date: Fri, 2 Feb 2018 17:42:33 +0000 Subject: [PATCH 145/149] xenbus: track caller request id Commit fd8aa9095a95 ("xen: optimize xenbus driver for multiple concurrent xenstore accesses") optimized xenbus concurrent accesses but in doing so broke UABI of /dev/xen/xenbus. Through /dev/xen/xenbus applications are in charge of xenbus message exchange with the correct header and body. Now, after the mentioned commit the replies received by application will no longer have the header req_id echoed back as it was on request (see specification below for reference), because that particular field is being overwritten by kernel. struct xsd_sockmsg { uint32_t type; /* XS_??? */ uint32_t req_id;/* Request identifier, echoed in daemon's response. */ uint32_t tx_id; /* Transaction id (0 if not related to a transaction). */ uint32_t len; /* Length of data following this. */ /* Generally followed by nul-terminated string(s). */ }; Before there was only one request at a time so req_id could simply be forwarded back and forth. To allow simultaneous requests we need a different req_id for each message thus kernel keeps a monotonic increasing counter for this field and is written on every request irrespective of userspace value. Forwarding again the req_id on userspace requests is not a solution because we would open the possibility of userspace-generated req_id colliding with kernel ones. So this patch instead takes another route which is to artificially keep user req_id while keeping the xenbus logic as is. We do that by saving the original req_id before xs_send(), use the private kernel counter as req_id and then once reply comes and was validated, we restore back the original req_id. Cc: # 4.11 Fixes: fd8aa9095a ("xen: optimize xenbus driver for multiple concurrent xenstore accesses") Reported-by: Bhavesh Davda Signed-off-by: Joao Martins Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/xen/xenbus/xenbus.h | 1 + drivers/xen/xenbus/xenbus_comms.c | 1 + drivers/xen/xenbus/xenbus_xs.c | 3 +++ 3 files changed, 5 insertions(+) diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h index 149c5e7efc89..092981171df1 100644 --- a/drivers/xen/xenbus/xenbus.h +++ b/drivers/xen/xenbus/xenbus.h @@ -76,6 +76,7 @@ struct xb_req_data { struct list_head list; wait_queue_head_t wq; struct xsd_sockmsg msg; + uint32_t caller_req_id; enum xsd_sockmsg_type type; char *body; const struct kvec *vec; diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c index 5b081a01779d..d239fc3c5e3d 100644 --- a/drivers/xen/xenbus/xenbus_comms.c +++ b/drivers/xen/xenbus/xenbus_comms.c @@ -309,6 +309,7 @@ static int process_msg(void) goto out; if (req->state == xb_req_state_wait_reply) { + req->msg.req_id = req->caller_req_id; req->msg.type = state.msg.type; req->msg.len = state.msg.len; req->body = state.body; diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 3e59590c7254..3f3b29398ab8 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -227,6 +227,8 @@ static void xs_send(struct xb_req_data *req, struct xsd_sockmsg *msg) req->state = xb_req_state_queued; init_waitqueue_head(&req->wq); + /* Save the caller req_id and restore it later in the reply */ + req->caller_req_id = req->msg.req_id; req->msg.req_id = xs_request_enter(req); mutex_lock(&xb_write_mutex); @@ -310,6 +312,7 @@ static void *xs_talkv(struct xenbus_transaction t, req->num_vecs = num_vecs; req->cb = xs_wake_up; + msg.req_id = 0; msg.tx_id = t.id; msg.type = type; msg.len = 0; From 63e708f826bb21470155d37b103a75d8a9e25b18 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 7 Feb 2018 18:49:23 -0500 Subject: [PATCH 146/149] x86/xen: Calculate __max_logical_packages on PV domains The kernel panics on PV domains because native_smp_cpus_done() is only called for HVM domains. Calculate __max_logical_packages for PV domains. Fixes: b4c0a7326f5d ("x86/smpboot: Fix __max_logical_packages estimate") Signed-off-by: Prarit Bhargava Tested-and-reported-by: Simon Gaiser Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: x86@kernel.org Cc: Boris Ostrovsky Cc: Juergen Gross Cc: Dou Liyang Cc: Prarit Bhargava Cc: Kate Stewart Cc: Greg Kroah-Hartman Cc: Andy Lutomirski Cc: Andi Kleen Cc: Vitaly Kuznetsov Cc: xen-devel@lists.xenproject.org Reviewed-by: Boris Ostrovsky Signed-off-by: Juergen Gross --- arch/x86/include/asm/smp.h | 1 + arch/x86/kernel/smpboot.c | 10 ++++++++-- arch/x86/xen/smp.c | 2 ++ 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 461f53d27708..a4189762b266 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -129,6 +129,7 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask) void cpu_disable_common(void); void native_smp_prepare_boot_cpu(void); void native_smp_prepare_cpus(unsigned int max_cpus); +void calculate_max_logical_packages(void); void native_smp_cpus_done(unsigned int max_cpus); void common_cpu_up(unsigned int cpunum, struct task_struct *tidle); int native_cpu_up(unsigned int cpunum, struct task_struct *tidle); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index cfc61e1d45e2..9eee25d07586 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1281,11 +1281,10 @@ void __init native_smp_prepare_boot_cpu(void) cpu_set_state_online(me); } -void __init native_smp_cpus_done(unsigned int max_cpus) +void __init calculate_max_logical_packages(void) { int ncpus; - pr_debug("Boot done\n"); /* * Today neither Intel nor AMD support heterogenous systems so * extrapolate the boot cpu's data to all packages. @@ -1293,6 +1292,13 @@ void __init native_smp_cpus_done(unsigned int max_cpus) ncpus = cpu_data(0).booted_cores * topology_max_smt_threads(); __max_logical_packages = DIV_ROUND_UP(nr_cpu_ids, ncpus); pr_info("Max logical packages: %u\n", __max_logical_packages); +} + +void __init native_smp_cpus_done(unsigned int max_cpus) +{ + pr_debug("Boot done\n"); + + calculate_max_logical_packages(); if (x86_has_numa_in_package) set_sched_topology(x86_numa_in_package_topology); diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 77c959cf81e7..7a43b2ae19f1 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -122,6 +122,8 @@ void __init xen_smp_cpus_done(unsigned int max_cpus) if (xen_hvm_domain()) native_smp_cpus_done(max_cpus); + else + calculate_max_logical_packages(); if (xen_have_vcpu_info_placement) return; From 64d6871827b1e2ac8c9daf49f2c883378c7d50cd Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 14 Feb 2018 10:28:23 -0800 Subject: [PATCH 147/149] pvcalls-front: introduce a per sock_mapping refcount Introduce a per sock_mapping refcount, in addition to the existing global refcount. Thanks to the sock_mapping refcount, we can safely wait for it to be 1 in pvcalls_front_release before freeing an active socket, instead of waiting for the global refcount to be 1. Signed-off-by: Stefano Stabellini Acked-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/xen/pvcalls-front.c | 191 +++++++++++++++--------------------- 1 file changed, 79 insertions(+), 112 deletions(-) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 753d9cb437d0..11ce470b41a5 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -60,6 +60,7 @@ struct sock_mapping { bool active_socket; struct list_head list; struct socket *sock; + atomic_t refcount; union { struct { int irq; @@ -93,6 +94,32 @@ struct sock_mapping { }; }; +static inline struct sock_mapping *pvcalls_enter_sock(struct socket *sock) +{ + struct sock_mapping *map; + + if (!pvcalls_front_dev || + dev_get_drvdata(&pvcalls_front_dev->dev) == NULL) + return ERR_PTR(-ENOTCONN); + + map = (struct sock_mapping *)sock->sk->sk_send_head; + if (map == NULL) + return ERR_PTR(-ENOTSOCK); + + pvcalls_enter(); + atomic_inc(&map->refcount); + return map; +} + +static inline void pvcalls_exit_sock(struct socket *sock) +{ + struct sock_mapping *map; + + map = (struct sock_mapping *)sock->sk->sk_send_head; + atomic_dec(&map->refcount); + pvcalls_exit(); +} + static inline int get_request(struct pvcalls_bedata *bedata, int *req_id) { *req_id = bedata->ring.req_prod_pvt & (RING_SIZE(&bedata->ring) - 1); @@ -369,31 +396,23 @@ int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr, if (addr->sa_family != AF_INET || sock->type != SOCK_STREAM) return -EOPNOTSUPP; - pvcalls_enter(); - if (!pvcalls_front_dev) { - pvcalls_exit(); - return -ENOTCONN; - } + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) + return PTR_ERR(map); bedata = dev_get_drvdata(&pvcalls_front_dev->dev); - map = (struct sock_mapping *)sock->sk->sk_send_head; - if (!map) { - pvcalls_exit(); - return -ENOTSOCK; - } - spin_lock(&bedata->socket_lock); ret = get_request(bedata, &req_id); if (ret < 0) { spin_unlock(&bedata->socket_lock); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } ret = create_active(map, &evtchn); if (ret < 0) { spin_unlock(&bedata->socket_lock); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } @@ -423,7 +442,7 @@ int pvcalls_front_connect(struct socket *sock, struct sockaddr *addr, smp_rmb(); ret = bedata->rsp[req_id].ret; bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } @@ -488,23 +507,15 @@ int pvcalls_front_sendmsg(struct socket *sock, struct msghdr *msg, if (flags & (MSG_CONFIRM|MSG_DONTROUTE|MSG_EOR|MSG_OOB)) return -EOPNOTSUPP; - pvcalls_enter(); - if (!pvcalls_front_dev) { - pvcalls_exit(); - return -ENOTCONN; - } + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) + return PTR_ERR(map); bedata = dev_get_drvdata(&pvcalls_front_dev->dev); - map = (struct sock_mapping *) sock->sk->sk_send_head; - if (!map) { - pvcalls_exit(); - return -ENOTSOCK; - } - mutex_lock(&map->active.out_mutex); if ((flags & MSG_DONTWAIT) && !pvcalls_front_write_todo(map)) { mutex_unlock(&map->active.out_mutex); - pvcalls_exit(); + pvcalls_exit_sock(sock); return -EAGAIN; } if (len > INT_MAX) @@ -526,7 +537,7 @@ again: tot_sent = sent; mutex_unlock(&map->active.out_mutex); - pvcalls_exit(); + pvcalls_exit_sock(sock); return tot_sent; } @@ -591,19 +602,11 @@ int pvcalls_front_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (flags & (MSG_CMSG_CLOEXEC|MSG_ERRQUEUE|MSG_OOB|MSG_TRUNC)) return -EOPNOTSUPP; - pvcalls_enter(); - if (!pvcalls_front_dev) { - pvcalls_exit(); - return -ENOTCONN; - } + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) + return PTR_ERR(map); bedata = dev_get_drvdata(&pvcalls_front_dev->dev); - map = (struct sock_mapping *) sock->sk->sk_send_head; - if (!map) { - pvcalls_exit(); - return -ENOTSOCK; - } - mutex_lock(&map->active.in_mutex); if (len > XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER)) len = XEN_FLEX_RING_SIZE(PVCALLS_RING_ORDER); @@ -623,7 +626,7 @@ int pvcalls_front_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, ret = 0; mutex_unlock(&map->active.in_mutex); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } @@ -637,24 +640,16 @@ int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len) if (addr->sa_family != AF_INET || sock->type != SOCK_STREAM) return -EOPNOTSUPP; - pvcalls_enter(); - if (!pvcalls_front_dev) { - pvcalls_exit(); - return -ENOTCONN; - } + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) + return PTR_ERR(map); bedata = dev_get_drvdata(&pvcalls_front_dev->dev); - map = (struct sock_mapping *) sock->sk->sk_send_head; - if (map == NULL) { - pvcalls_exit(); - return -ENOTSOCK; - } - spin_lock(&bedata->socket_lock); ret = get_request(bedata, &req_id); if (ret < 0) { spin_unlock(&bedata->socket_lock); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } req = RING_GET_REQUEST(&bedata->ring, req_id); @@ -684,7 +679,7 @@ int pvcalls_front_bind(struct socket *sock, struct sockaddr *addr, int addr_len) bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; map->passive.status = PVCALLS_STATUS_BIND; - pvcalls_exit(); + pvcalls_exit_sock(sock); return 0; } @@ -695,21 +690,13 @@ int pvcalls_front_listen(struct socket *sock, int backlog) struct xen_pvcalls_request *req; int notify, req_id, ret; - pvcalls_enter(); - if (!pvcalls_front_dev) { - pvcalls_exit(); - return -ENOTCONN; - } + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) + return PTR_ERR(map); bedata = dev_get_drvdata(&pvcalls_front_dev->dev); - map = (struct sock_mapping *) sock->sk->sk_send_head; - if (!map) { - pvcalls_exit(); - return -ENOTSOCK; - } - if (map->passive.status != PVCALLS_STATUS_BIND) { - pvcalls_exit(); + pvcalls_exit_sock(sock); return -EOPNOTSUPP; } @@ -717,7 +704,7 @@ int pvcalls_front_listen(struct socket *sock, int backlog) ret = get_request(bedata, &req_id); if (ret < 0) { spin_unlock(&bedata->socket_lock); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } req = RING_GET_REQUEST(&bedata->ring, req_id); @@ -741,7 +728,7 @@ int pvcalls_front_listen(struct socket *sock, int backlog) bedata->rsp[req_id].req_id = PVCALLS_INVALID_ID; map->passive.status = PVCALLS_STATUS_LISTEN; - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } @@ -753,21 +740,13 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags) struct xen_pvcalls_request *req; int notify, req_id, ret, evtchn, nonblock; - pvcalls_enter(); - if (!pvcalls_front_dev) { - pvcalls_exit(); - return -ENOTCONN; - } + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) + return PTR_ERR(map); bedata = dev_get_drvdata(&pvcalls_front_dev->dev); - map = (struct sock_mapping *) sock->sk->sk_send_head; - if (!map) { - pvcalls_exit(); - return -ENOTSOCK; - } - if (map->passive.status != PVCALLS_STATUS_LISTEN) { - pvcalls_exit(); + pvcalls_exit_sock(sock); return -EINVAL; } @@ -785,13 +764,13 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags) goto received; } if (nonblock) { - pvcalls_exit(); + pvcalls_exit_sock(sock); return -EAGAIN; } if (wait_event_interruptible(map->passive.inflight_accept_req, !test_and_set_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags))) { - pvcalls_exit(); + pvcalls_exit_sock(sock); return -EINTR; } } @@ -802,7 +781,7 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags) clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags); spin_unlock(&bedata->socket_lock); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } map2 = kzalloc(sizeof(*map2), GFP_ATOMIC); @@ -810,7 +789,7 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags) clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags); spin_unlock(&bedata->socket_lock); - pvcalls_exit(); + pvcalls_exit_sock(sock); return -ENOMEM; } ret = create_active(map2, &evtchn); @@ -819,7 +798,7 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags) clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags); spin_unlock(&bedata->socket_lock); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } list_add_tail(&map2->list, &bedata->socket_mappings); @@ -841,13 +820,13 @@ int pvcalls_front_accept(struct socket *sock, struct socket *newsock, int flags) /* We could check if we have received a response before returning. */ if (nonblock) { WRITE_ONCE(map->passive.inflight_req_id, req_id); - pvcalls_exit(); + pvcalls_exit_sock(sock); return -EAGAIN; } if (wait_event_interruptible(bedata->inflight_req, READ_ONCE(bedata->rsp[req_id].req_id) == req_id)) { - pvcalls_exit(); + pvcalls_exit_sock(sock); return -EINTR; } /* read req_id, then the content */ @@ -862,7 +841,7 @@ received: clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags); pvcalls_front_free_map(bedata, map2); - pvcalls_exit(); + pvcalls_exit_sock(sock); return -ENOMEM; } newsock->sk->sk_send_head = (void *)map2; @@ -874,7 +853,7 @@ received: clear_bit(PVCALLS_FLAG_ACCEPT_INFLIGHT, (void *)&map->passive.flags); wake_up(&map->passive.inflight_accept_req); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } @@ -965,23 +944,16 @@ __poll_t pvcalls_front_poll(struct file *file, struct socket *sock, struct sock_mapping *map; __poll_t ret; - pvcalls_enter(); - if (!pvcalls_front_dev) { - pvcalls_exit(); + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) return EPOLLNVAL; - } bedata = dev_get_drvdata(&pvcalls_front_dev->dev); - map = (struct sock_mapping *) sock->sk->sk_send_head; - if (!map) { - pvcalls_exit(); - return EPOLLNVAL; - } if (map->active_socket) ret = pvcalls_front_poll_active(file, bedata, map, wait); else ret = pvcalls_front_poll_passive(file, bedata, map, wait); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } @@ -995,25 +967,20 @@ int pvcalls_front_release(struct socket *sock) if (sock->sk == NULL) return 0; - pvcalls_enter(); - if (!pvcalls_front_dev) { - pvcalls_exit(); - return -EIO; + map = pvcalls_enter_sock(sock); + if (IS_ERR(map)) { + if (PTR_ERR(map) == -ENOTCONN) + return -EIO; + else + return 0; } - bedata = dev_get_drvdata(&pvcalls_front_dev->dev); - map = (struct sock_mapping *) sock->sk->sk_send_head; - if (map == NULL) { - pvcalls_exit(); - return 0; - } - spin_lock(&bedata->socket_lock); ret = get_request(bedata, &req_id); if (ret < 0) { spin_unlock(&bedata->socket_lock); - pvcalls_exit(); + pvcalls_exit_sock(sock); return ret; } sock->sk->sk_send_head = NULL; @@ -1043,10 +1010,10 @@ int pvcalls_front_release(struct socket *sock) /* * We need to make sure that sendmsg/recvmsg on this socket have * not started before we've cleared sk_send_head here. The - * easiest (though not optimal) way to guarantee this is to see - * that no pvcall (other than us) is in progress. + * easiest way to guarantee this is to see that no pvcalls + * (other than us) is in progress on this socket. */ - while (atomic_read(&pvcalls_refcount) > 1) + while (atomic_read(&map->refcount) > 1) cpu_relax(); pvcalls_front_free_map(bedata, map); From d1a75e0896f5e9f5cb6a979caaea39f1f4b9feb1 Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Wed, 14 Feb 2018 10:28:24 -0800 Subject: [PATCH 148/149] pvcalls-front: wait for other operations to return when release passive sockets Passive sockets can have ongoing operations on them, specifically, we have two wait_event_interruptable calls in pvcalls_front_accept. Add two wake_up calls in pvcalls_front_release, then wait for the potential waiters to return and release the sock_mapping refcount. Signed-off-by: Stefano Stabellini Acked-by: Juergen Gross Signed-off-by: Juergen Gross --- drivers/xen/pvcalls-front.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/xen/pvcalls-front.c b/drivers/xen/pvcalls-front.c index 11ce470b41a5..aedbee3b2838 100644 --- a/drivers/xen/pvcalls-front.c +++ b/drivers/xen/pvcalls-front.c @@ -1018,6 +1018,12 @@ int pvcalls_front_release(struct socket *sock) pvcalls_front_free_map(bedata, map); } else { + wake_up(&bedata->inflight_req); + wake_up(&map->passive.inflight_accept_req); + + while (atomic_read(&map->refcount) > 1) + cpu_relax(); + spin_lock(&bedata->socket_lock); list_del(&map->list); spin_unlock(&bedata->socket_lock); From 91ab883eb21325ad80f3473633f794c78ac87f51 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 18 Feb 2018 17:29:42 -0800 Subject: [PATCH 149/149] Linux 4.16-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 79ad2bfa24b6..d9cf3a40eda9 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 4 PATCHLEVEL = 16 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Fearless Coyote # *DOCUMENTATION*