From e789ca0cc1d51296832b8424fa4008ce6e9d1703 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 16 Dec 2020 11:18:37 +0100 Subject: [PATCH 001/241] ext4: combine ext4_handle_error() and save_error_info() save_error_info() is always called together with ext4_handle_error(). Combine them into a single call and move unconditional bits out of save_error_info() into ext4_handle_error(). Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20201216101844.22917-2-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 4bbfb05aae58..cdf2a377d884 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -592,9 +592,6 @@ static void __save_error_info(struct super_block *sb, int error, { struct ext4_sb_info *sbi = EXT4_SB(sb); - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - if (bdev_read_only(sb->s_bdev)) - return; /* We default to EFSCORRUPTED error... */ if (error == 0) error = EFSCORRUPTED; @@ -647,13 +644,19 @@ static void save_error_info(struct super_block *sb, int error, * used to deal with unrecoverable failures such as journal IO errors or ENOMEM * at a critical moment in log management. */ -static void ext4_handle_error(struct super_block *sb, bool force_ro) +static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, + __u32 ino, __u64 block, + const char *func, unsigned int line) { journal_t *journal = EXT4_SB(sb)->s_journal; + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); + if (!bdev_read_only(sb->s_bdev)) + save_error_info(sb, error, ino, block, func, line); + if (sb_rdonly(sb) || (!force_ro && test_opt(sb, ERRORS_CONT))) return; @@ -710,8 +713,7 @@ void __ext4_error(struct super_block *sb, const char *function, sb->s_id, function, line, current->comm, &vaf); va_end(args); } - save_error_info(sb, error, 0, block, function, line); - ext4_handle_error(sb, force_ro); + ext4_handle_error(sb, force_ro, error, 0, block, function, line); } void __ext4_error_inode(struct inode *inode, const char *function, @@ -741,9 +743,8 @@ void __ext4_error_inode(struct inode *inode, const char *function, current->comm, &vaf); va_end(args); } - save_error_info(inode->i_sb, error, inode->i_ino, block, - function, line); - ext4_handle_error(inode->i_sb, false); + ext4_handle_error(inode->i_sb, false, error, inode->i_ino, block, + function, line); } void __ext4_error_file(struct file *file, const char *function, @@ -780,9 +781,8 @@ void __ext4_error_file(struct file *file, const char *function, current->comm, path, &vaf); va_end(args); } - save_error_info(inode->i_sb, EFSCORRUPTED, inode->i_ino, block, - function, line); - ext4_handle_error(inode->i_sb, false); + ext4_handle_error(inode->i_sb, false, EFSCORRUPTED, inode->i_ino, block, + function, line); } const char *ext4_decode_error(struct super_block *sb, int errno, @@ -849,8 +849,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, sb->s_id, function, line, errstr); } - save_error_info(sb, -errno, 0, 0, function, line); - ext4_handle_error(sb, false); + ext4_handle_error(sb, false, -errno, 0, 0, function, line); } void __ext4_msg(struct super_block *sb, @@ -944,13 +943,14 @@ __acquires(bitlock) if (test_opt(sb, ERRORS_CONT)) { if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; __save_error_info(sb, EFSCORRUPTED, ino, block, function, line); - schedule_work(&EXT4_SB(sb)->s_error_work); + if (!bdev_read_only(sb->s_bdev)) + schedule_work(&EXT4_SB(sb)->s_error_work); return; } ext4_unlock_group(sb, grp); - save_error_info(sb, EFSCORRUPTED, ino, block, function, line); - ext4_handle_error(sb, false); + ext4_handle_error(sb, false, EFSCORRUPTED, ino, block, function, line); /* * We only get here in the ERRORS_RO case; relocking the group * may be dangerous, but nothing bad will happen since the From 4392fbc4bab57db3760f0fb61258cb7089b37665 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 16 Dec 2020 11:18:38 +0100 Subject: [PATCH 002/241] ext4: drop sync argument of ext4_commit_super() Everybody passes 1 as sync argument of ext4_commit_super(). Just drop it. Reviewed-by: Harshad Shirwadkar Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20201216101844.22917-3-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 47 ++++++++++++++++++++++------------------------- 1 file changed, 22 insertions(+), 25 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index cdf2a377d884..8bf31003416a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -65,7 +65,7 @@ static struct ratelimit_state ext4_mount_msg_ratelimit; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); static int ext4_show_options(struct seq_file *seq, struct dentry *root); -static int ext4_commit_super(struct super_block *sb, int sync); +static int ext4_commit_super(struct super_block *sb); static int ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); static int ext4_clear_journal_err(struct super_block *sb, @@ -621,7 +621,7 @@ static void save_error_info(struct super_block *sb, int error, { __save_error_info(sb, error, ino, block, func, line); if (!bdev_read_only(sb->s_bdev)) - ext4_commit_super(sb, 1); + ext4_commit_super(sb); } /* Deal with the reporting of failure conditions on a filesystem such as @@ -686,7 +686,7 @@ static void flush_stashed_error_work(struct work_struct *work) struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, s_error_work); - ext4_commit_super(sbi->s_sb, 1); + ext4_commit_super(sbi->s_sb); } #define ext4_error_ratelimit(sb) \ @@ -1152,7 +1152,7 @@ static void ext4_put_super(struct super_block *sb) es->s_state = cpu_to_le16(sbi->s_mount_state); } if (!sb_rdonly(sb)) - ext4_commit_super(sb, 1); + ext4_commit_super(sb); rcu_read_lock(); group_desc = rcu_dereference(sbi->s_group_desc); @@ -2642,7 +2642,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, if (sbi->s_journal) ext4_set_feature_journal_needs_recovery(sb); - err = ext4_commit_super(sb, 1); + err = ext4_commit_super(sb); done: if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " @@ -4869,7 +4869,7 @@ no_journal: if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) && !ext4_has_feature_encrypt(sb)) { ext4_set_feature_encrypt(sb); - ext4_commit_super(sb, 1); + ext4_commit_super(sb); } /* @@ -5424,7 +5424,7 @@ static int ext4_load_journal(struct super_block *sb, es->s_journal_dev = cpu_to_le32(journal_devnum); /* Make sure we flush the recovery flag to disk. */ - ext4_commit_super(sb, 1); + ext4_commit_super(sb); } return 0; @@ -5434,7 +5434,7 @@ err_out: return err; } -static int ext4_commit_super(struct super_block *sb, int sync) +static int ext4_commit_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = EXT4_SB(sb)->s_es; @@ -5515,8 +5515,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) BUFFER_TRACE(sbh, "marking dirty"); ext4_superblock_csum_set(sb); - if (sync) - lock_buffer(sbh); + lock_buffer(sbh); if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { /* * Oh, dear. A previous attempt to write the @@ -5532,16 +5531,14 @@ static int ext4_commit_super(struct super_block *sb, int sync) set_buffer_uptodate(sbh); } mark_buffer_dirty(sbh); - if (sync) { - unlock_buffer(sbh); - error = __sync_dirty_buffer(sbh, - REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0)); - if (buffer_write_io_error(sbh)) { - ext4_msg(sb, KERN_ERR, "I/O error while writing " - "superblock"); - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); - } + unlock_buffer(sbh); + error = __sync_dirty_buffer(sbh, + REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0)); + if (buffer_write_io_error(sbh)) { + ext4_msg(sb, KERN_ERR, "I/O error while writing " + "superblock"); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); } return error; } @@ -5572,7 +5569,7 @@ static int ext4_mark_recovery_complete(struct super_block *sb, if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) { ext4_clear_feature_journal_needs_recovery(sb); - ext4_commit_super(sb, 1); + ext4_commit_super(sb); } out: jbd2_journal_unlock_updates(journal); @@ -5614,7 +5611,7 @@ static int ext4_clear_journal_err(struct super_block *sb, EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb, 1); + ext4_commit_super(sb); jbd2_journal_clear_err(journal); jbd2_journal_update_sb_errno(journal); @@ -5716,7 +5713,7 @@ static int ext4_freeze(struct super_block *sb) ext4_clear_feature_journal_needs_recovery(sb); } - error = ext4_commit_super(sb, 1); + error = ext4_commit_super(sb); out: if (journal) /* we rely on upper layer to stop further updates */ @@ -5738,7 +5735,7 @@ static int ext4_unfreeze(struct super_block *sb) ext4_set_feature_journal_needs_recovery(sb); } - ext4_commit_super(sb, 1); + ext4_commit_super(sb); return 0; } @@ -5998,7 +5995,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } if (sbi->s_journal == NULL && !(old_sb_flags & SB_RDONLY)) { - err = ext4_commit_super(sb, 1); + err = ext4_commit_super(sb); if (err) goto restore_opts; } From 05c2c00f3769abb9e323fcaca70d2de0b48af7ba Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 16 Dec 2020 11:18:39 +0100 Subject: [PATCH 003/241] ext4: protect superblock modifications with a buffer lock Protect all superblock modifications (including checksum computation) with a superblock buffer lock. That way we are sure computed checksum matches current superblock contents (a mismatch could cause checksum failures in nojournal mode or if an unjournalled superblock update races with a journalled one). Also we avoid modifying superblock contents while it is being written out (which can cause DIF/DIX failures if we are running in nojournal mode). Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20201216101844.22917-4-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4_jbd2.c | 1 - fs/ext4/file.c | 3 +++ fs/ext4/inode.c | 3 +++ fs/ext4/namei.c | 6 ++++++ fs/ext4/resize.c | 12 ++++++++++++ fs/ext4/super.c | 2 +- fs/ext4/xattr.c | 3 +++ 7 files changed, 28 insertions(+), 2 deletions(-) diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 1a0a827a7f34..c7e410c4ab7d 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -379,7 +379,6 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line, struct buffer_head *bh = EXT4_SB(sb)->s_sbh; int err = 0; - ext4_superblock_csum_set(sb); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); if (err) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3ed8c048fb12..26907d5835d0 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -809,8 +809,11 @@ static int ext4_sample_last_mounted(struct super_block *sb, err = ext4_journal_get_write_access(handle, sbi->s_sbh); if (err) goto out_journal; + lock_buffer(sbi->s_sbh); strlcpy(sbi->s_es->s_last_mounted, cp, sizeof(sbi->s_es->s_last_mounted)); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); ext4_handle_dirty_super(handle, sb); out_journal: ext4_journal_stop(handle); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 27946882d4ce..27de6f0a33c8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5150,7 +5150,10 @@ static int ext4_do_update_inode(handle_t *handle, err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); if (err) goto out_brelse; + lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_large_file(sb); + ext4_superblock_csum_set(sb); + unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_handle_sync(handle); err = ext4_handle_dirty_super(handle, sb); } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 7a890ff214f1..40970a509dcc 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2978,7 +2978,10 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) (le32_to_cpu(sbi->s_es->s_inodes_count))) { /* Insert this inode at the head of the on-disk orphan list */ NEXT_ORPHAN(inode) = le32_to_cpu(sbi->s_es->s_last_orphan); + lock_buffer(sbi->s_sbh); sbi->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); dirty = true; } list_add(&EXT4_I(inode)->i_orphan, &sbi->s_orphan); @@ -3061,7 +3064,10 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) mutex_unlock(&sbi->s_orphan_lock); goto out_brelse; } + lock_buffer(sbi->s_sbh); sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); + ext4_superblock_csum_set(inode->i_sb); + unlock_buffer(sbi->s_sbh); mutex_unlock(&sbi->s_orphan_lock); err = ext4_handle_dirty_super(handle, inode->i_sb); } else { diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 928700d57eb6..6155f2b9538c 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -899,7 +899,10 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, EXT4_SB(sb)->s_gdb_count++; ext4_kvfree_array_rcu(o_group_desc); + lock_buffer(EXT4_SB(sb)->s_sbh); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); + ext4_superblock_csum_set(sb); + unlock_buffer(EXT4_SB(sb)->s_sbh); err = ext4_handle_dirty_super(handle, sb); if (err) ext4_std_error(sb, err); @@ -1384,6 +1387,7 @@ static void ext4_update_super(struct super_block *sb, reserved_blocks *= blocks_count; do_div(reserved_blocks, 100); + lock_buffer(sbi->s_sbh); ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count); ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks); le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) * @@ -1421,6 +1425,8 @@ static void ext4_update_super(struct super_block *sb, * active. */ ext4_r_blocks_count_set(es, ext4_r_blocks_count(es) + reserved_blocks); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); /* Update the free space counts */ percpu_counter_add(&sbi->s_freeclusters_counter, @@ -1717,8 +1723,11 @@ static int ext4_group_extend_no_check(struct super_block *sb, goto errout; } + lock_buffer(EXT4_SB(sb)->s_sbh); ext4_blocks_count_set(es, o_blocks_count + add); ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add); + ext4_superblock_csum_set(sb); + unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ @@ -1874,10 +1883,13 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) if (err) goto errout; + lock_buffer(sbi->s_sbh); ext4_clear_feature_resize_inode(sb); ext4_set_feature_meta_bg(sb); sbi->s_es->s_first_meta_bg = cpu_to_le32(num_desc_blocks(sb, sbi->s_groups_count)); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); err = ext4_handle_dirty_super(handle, sb); if (err) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8bf31003416a..2d653ad14200 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5444,6 +5444,7 @@ static int ext4_commit_super(struct super_block *sb) if (!sbh || block_device_ejected(sb)) return error; + lock_buffer(sbh); /* * If the file system is mounted read-only, don't update the * superblock write time. This avoids updating the superblock @@ -5515,7 +5516,6 @@ static int ext4_commit_super(struct super_block *sb) BUFFER_TRACE(sbh, "marking dirty"); ext4_superblock_csum_set(sb); - lock_buffer(sbh); if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { /* * Oh, dear. A previous attempt to write the diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 4e3b1f8c2e81..1db7ca778d69 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -792,7 +792,10 @@ static void ext4_xattr_update_super_block(handle_t *handle, BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { + lock_buffer(EXT4_SB(sb)->s_sbh); ext4_set_feature_xattr(sb); + ext4_superblock_csum_set(sb); + unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_handle_dirty_super(handle, sb); } } From 2d01ddc86606564fb08c56e3bc93a0693895f710 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 16 Dec 2020 11:18:40 +0100 Subject: [PATCH 004/241] ext4: save error info to sb through journal if available If journalling is still working at the moment we get to writing error information to the superblock we cannot write directly to the superblock as such write could race with journalled update of the superblock and cause journal checksum failures, writing inconsistent information to the journal or other problems. We cannot journal the superblock directly from the error handling functions as we are running in uncertain context and could deadlock so just punt journalled superblock update to a workqueue. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20201216101844.22917-5-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 103 +++++++++++++++++++++++++++++++++++------------- 1 file changed, 76 insertions(+), 27 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2d653ad14200..9a256c558e04 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -65,6 +65,7 @@ static struct ratelimit_state ext4_mount_msg_ratelimit; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); static int ext4_show_options(struct seq_file *seq, struct dentry *root); +static void ext4_update_super(struct super_block *sb); static int ext4_commit_super(struct super_block *sb); static int ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); @@ -586,9 +587,9 @@ static int ext4_errno_to_code(int errno) return EXT4_ERR_UNKNOWN; } -static void __save_error_info(struct super_block *sb, int error, - __u32 ino, __u64 block, - const char *func, unsigned int line) +static void save_error_info(struct super_block *sb, int error, + __u32 ino, __u64 block, + const char *func, unsigned int line) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -615,15 +616,6 @@ static void __save_error_info(struct super_block *sb, int error, spin_unlock(&sbi->s_error_lock); } -static void save_error_info(struct super_block *sb, int error, - __u32 ino, __u64 block, - const char *func, unsigned int line) -{ - __save_error_info(sb, error, ino, block, func, line); - if (!bdev_read_only(sb->s_bdev)) - ext4_commit_super(sb); -} - /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * @@ -649,20 +641,35 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, const char *func, unsigned int line) { journal_t *journal = EXT4_SB(sb)->s_journal; + bool continue_fs = !force_ro && test_opt(sb, ERRORS_CONT); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); - if (!bdev_read_only(sb->s_bdev)) - save_error_info(sb, error, ino, block, func, line); + if (!continue_fs && !sb_rdonly(sb)) { + ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); + if (journal) + jbd2_journal_abort(journal, -EIO); + } - if (sb_rdonly(sb) || (!force_ro && test_opt(sb, ERRORS_CONT))) + if (!bdev_read_only(sb->s_bdev)) { + save_error_info(sb, error, ino, block, func, line); + /* + * In case the fs should keep running, we need to writeout + * superblock through the journal. Due to lock ordering + * constraints, it may not be safe to do it right here so we + * defer superblock flushing to a workqueue. + */ + if (continue_fs) + schedule_work(&EXT4_SB(sb)->s_error_work); + else + ext4_commit_super(sb); + } + + if (sb_rdonly(sb) || continue_fs) return; - ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); - if (journal) - jbd2_journal_abort(journal, -EIO); /* * We force ERRORS_RO behavior when system is rebooting. Otherwise we * could panic during 'reboot -f' as the underlying device got already @@ -685,7 +692,38 @@ static void flush_stashed_error_work(struct work_struct *work) { struct ext4_sb_info *sbi = container_of(work, struct ext4_sb_info, s_error_work); + journal_t *journal = sbi->s_journal; + handle_t *handle; + /* + * If the journal is still running, we have to write out superblock + * through the journal to avoid collisions of other journalled sb + * updates. + * + * We use directly jbd2 functions here to avoid recursing back into + * ext4 error handling code during handling of previous errors. + */ + if (!sb_rdonly(sbi->s_sb) && journal) { + handle = jbd2_journal_start(journal, 1); + if (IS_ERR(handle)) + goto write_directly; + if (jbd2_journal_get_write_access(handle, sbi->s_sbh)) { + jbd2_journal_stop(handle); + goto write_directly; + } + ext4_update_super(sbi->s_sb); + if (jbd2_journal_dirty_metadata(handle, sbi->s_sbh)) { + jbd2_journal_stop(handle); + goto write_directly; + } + jbd2_journal_stop(handle); + return; + } +write_directly: + /* + * Write through journal failed. Write sb directly to get error info + * out and hope for the best. + */ ext4_commit_super(sbi->s_sb); } @@ -944,9 +982,11 @@ __acquires(bitlock) if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - __save_error_info(sb, EFSCORRUPTED, ino, block, function, line); - if (!bdev_read_only(sb->s_bdev)) + if (!bdev_read_only(sb->s_bdev)) { + save_error_info(sb, EFSCORRUPTED, ino, block, function, + line); schedule_work(&EXT4_SB(sb)->s_error_work); + } return; } ext4_unlock_group(sb, grp); @@ -5434,15 +5474,12 @@ err_out: return err; } -static int ext4_commit_super(struct super_block *sb) +/* Copy state of EXT4_SB(sb) into buffer for on-disk superblock */ +static void ext4_update_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; - int error = 0; - - if (!sbh || block_device_ejected(sb)) - return error; lock_buffer(sbh); /* @@ -5514,8 +5551,20 @@ static int ext4_commit_super(struct super_block *sb) } spin_unlock(&sbi->s_error_lock); - BUFFER_TRACE(sbh, "marking dirty"); ext4_superblock_csum_set(sb); + unlock_buffer(sbh); +} + +static int ext4_commit_super(struct super_block *sb) +{ + struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; + int error = 0; + + if (!sbh || block_device_ejected(sb)) + return error; + + ext4_update_super(sb); + if (buffer_write_io_error(sbh) || !buffer_uptodate(sbh)) { /* * Oh, dear. A previous attempt to write the @@ -5530,8 +5579,8 @@ static int ext4_commit_super(struct super_block *sb) clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } + BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); - unlock_buffer(sbh); error = __sync_dirty_buffer(sbh, REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0)); if (buffer_write_io_error(sbh)) { From e92ad03fa53498f12b3f5ecb8822adc3bf815b28 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 16 Dec 2020 11:18:41 +0100 Subject: [PATCH 005/241] ext4: use sbi instead of EXT4_SB(sb) in ext4_update_super() No behavioral change. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20201216101844.22917-6-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/super.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9a256c558e04..0f0db49031dc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5478,8 +5478,8 @@ err_out: static void ext4_update_super(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; + struct ext4_super_block *es = sbi->s_es; + struct buffer_head *sbh = sbi->s_sbh; lock_buffer(sbh); /* @@ -5496,21 +5496,20 @@ static void ext4_update_super(struct super_block *sb) ext4_update_tstamp(es, s_wtime); if (sb->s_bdev->bd_part) es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + + cpu_to_le64(sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev->bd_part, sectors[STAT_WRITE]) - - EXT4_SB(sb)->s_sectors_written_start) >> 1)); + sbi->s_sectors_written_start) >> 1)); else - es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); - if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeclusters_counter)) + es->s_kbytes_written = cpu_to_le64(sbi->s_kbytes_written); + if (percpu_counter_initialized(&sbi->s_freeclusters_counter)) ext4_free_blocks_count_set(es, - EXT4_C2B(EXT4_SB(sb), percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeclusters_counter))); - if (percpu_counter_initialized(&EXT4_SB(sb)->s_freeinodes_counter)) + EXT4_C2B(sbi, percpu_counter_sum_positive( + &sbi->s_freeclusters_counter))); + if (percpu_counter_initialized(&sbi->s_freeinodes_counter)) es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( - &EXT4_SB(sb)->s_freeinodes_counter)); + &sbi->s_freeinodes_counter)); /* Copy error information to the on-disk superblock */ spin_lock(&sbi->s_error_lock); if (sbi->s_add_error_count > 0) { From dfd56c2c0c0dbb11be939b804ddc8d5395ab3432 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 16 Dec 2020 11:18:43 +0100 Subject: [PATCH 006/241] ext4: fix superblock checksum failure when setting password salt When setting password salt in the superblock, we forget to recompute the superblock checksum so it will not match until the next superblock modification which recomputes the checksum. Fix it. CC: Michael Halcrow Reported-by: Andreas Dilger Fixes: 9bd8212f981e ("ext4 crypto: add encryption policy and password salt support") Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20201216101844.22917-8-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ioctl.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index f0381876a7e5..106bf149e8ca 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -1157,7 +1157,10 @@ resizefs_out: err = ext4_journal_get_write_access(handle, sbi->s_sbh); if (err) goto pwsalt_err_journal; + lock_buffer(sbi->s_sbh); generate_random_uuid(sbi->s_es->s_encrypt_pw_salt); + ext4_superblock_csum_set(sb); + unlock_buffer(sbi->s_sbh); err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); pwsalt_err_journal: From a3f5cf14ff917d46a4d491cf86210fd639d1ff38 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 16 Dec 2020 11:18:44 +0100 Subject: [PATCH 007/241] ext4: drop ext4_handle_dirty_super() The wrapper is now useless since it does what ext4_handle_dirty_metadata() does. Just remove it. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20201216101844.22917-9-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4_jbd2.c | 16 ---------------- fs/ext4/ext4_jbd2.h | 5 ----- fs/ext4/file.c | 2 +- fs/ext4/inode.c | 3 ++- fs/ext4/namei.c | 4 ++-- fs/ext4/resize.c | 8 ++++---- fs/ext4/xattr.c | 2 +- 7 files changed, 10 insertions(+), 30 deletions(-) diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index c7e410c4ab7d..be799040a415 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -372,19 +372,3 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, } return err; } - -int __ext4_handle_dirty_super(const char *where, unsigned int line, - handle_t *handle, struct super_block *sb) -{ - struct buffer_head *bh = EXT4_SB(sb)->s_sbh; - int err = 0; - - if (ext4_handle_valid(handle)) { - err = jbd2_journal_dirty_metadata(handle, bh); - if (err) - ext4_journal_abort_handle(where, line, __func__, - bh, handle, err); - } else - mark_buffer_dirty(bh); - return err; -} diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index a124c68b0c75..0d2fa423b7ad 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -244,9 +244,6 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, handle_t *handle, struct inode *inode, struct buffer_head *bh); -int __ext4_handle_dirty_super(const char *where, unsigned int line, - handle_t *handle, struct super_block *sb); - #define ext4_journal_get_write_access(handle, bh) \ __ext4_journal_get_write_access(__func__, __LINE__, (handle), (bh)) #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ @@ -257,8 +254,6 @@ int __ext4_handle_dirty_super(const char *where, unsigned int line, #define ext4_handle_dirty_metadata(handle, inode, bh) \ __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ (bh)) -#define ext4_handle_dirty_super(handle, sb) \ - __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) handle_t *__ext4_journal_start_sb(struct super_block *sb, unsigned int line, int type, int blocks, int rsv_blocks, diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 26907d5835d0..1cd3d26e3217 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -814,7 +814,7 @@ static int ext4_sample_last_mounted(struct super_block *sb, sizeof(sbi->s_es->s_last_mounted)); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); - ext4_handle_dirty_super(handle, sb); + ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); out_journal: ext4_journal_stop(handle); out: diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 27de6f0a33c8..c173c8405856 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5155,7 +5155,8 @@ static int ext4_do_update_inode(handle_t *handle, ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); ext4_handle_sync(handle); - err = ext4_handle_dirty_super(handle, sb); + err = ext4_handle_dirty_metadata(handle, NULL, + EXT4_SB(sb)->s_sbh); } ext4_update_inode_fsync_trans(handle, inode, need_datasync); out_brelse: diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 40970a509dcc..a3b28ef2455a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2988,7 +2988,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) mutex_unlock(&sbi->s_orphan_lock); if (dirty) { - err = ext4_handle_dirty_super(handle, sb); + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; @@ -3069,7 +3069,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) ext4_superblock_csum_set(inode->i_sb); unlock_buffer(sbi->s_sbh); mutex_unlock(&sbi->s_orphan_lock); - err = ext4_handle_dirty_super(handle, inode->i_sb); + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); } else { struct ext4_iloc iloc2; struct inode *i_prev = diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 6155f2b9538c..bd0d185654f3 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -903,7 +903,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, le16_add_cpu(&es->s_reserved_gdt_blocks, -1); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); - err = ext4_handle_dirty_super(handle, sb); + err = ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); if (err) ext4_std_error(sb, err); return err; @@ -1521,7 +1521,7 @@ static int ext4_flex_group_add(struct super_block *sb, ext4_update_super(sb, flex_gd); - err = ext4_handle_dirty_super(handle, sb); + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); exit_journal: err2 = ext4_journal_stop(handle); @@ -1734,7 +1734,7 @@ static int ext4_group_extend_no_check(struct super_block *sb, err = ext4_group_add_blocks(handle, sb, o_blocks_count, add); if (err) goto errout; - ext4_handle_dirty_super(handle, sb); + ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); errout: @@ -1891,7 +1891,7 @@ static int ext4_convert_meta_bg(struct super_block *sb, struct inode *inode) ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); - err = ext4_handle_dirty_super(handle, sb); + err = ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); if (err) { ext4_std_error(sb, err); goto errout; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 1db7ca778d69..372208500f4e 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -796,7 +796,7 @@ static void ext4_xattr_update_super_block(handle_t *handle, ext4_set_feature_xattr(sb); ext4_superblock_csum_set(sb); unlock_buffer(EXT4_SB(sb)->s_sbh); - ext4_handle_dirty_super(handle, sb); + ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } } From 5a3b590d4b2db187faa6f06adc9a53d6199fb1f9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 17 Dec 2020 13:24:15 -0500 Subject: [PATCH 008/241] ext4: don't leak old mountpoint samples When the first file is opened, ext4 samples the mountpoint of the filesystem in 64 bytes of the super block. It does so using strlcpy(), this means that the remaining bytes in the super block string buffer are untouched. If the mount point before had a longer path than the current one, it can be reconstructed. Consider the case where the fs was mounted to "/media/johnjdeveloper" and later to "/". The super block buffer then contains "/\x00edia/johnjdeveloper". This case was seen in the wild and caused confusion how the name of a developer ands up on the super block of a filesystem used in production... Fix this by using strncpy() instead of strlcpy(). The superblock field is defined to be a fixed-size char array, and it is already marked using __nonstring in fs/ext4/ext4.h. The consumer of the field in e2fsprogs already assumes that in the case of a 64+ byte mount path, that s_last_mounted will not be NUL terminated. Link: https://lore.kernel.org/r/X9ujIOJG/HqMr88R@mit.edu Reported-by: Richard Weinberger Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 1cd3d26e3217..349b27f0dda0 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -810,7 +810,7 @@ static int ext4_sample_last_mounted(struct super_block *sb, if (err) goto out_journal; lock_buffer(sbi->s_sbh); - strlcpy(sbi->s_es->s_last_mounted, cp, + strncpy(sbi->s_es->s_last_mounted, cp, sizeof(sbi->s_es->s_last_mounted)); ext4_superblock_csum_set(sb); unlock_buffer(sbi->s_sbh); From 4d4f9c1a17a3480f8fe523673f7232b254d724b7 Mon Sep 17 00:00:00 2001 From: Paul Cercueil Date: Wed, 16 Dec 2020 23:39:56 +0000 Subject: [PATCH 009/241] MIPS: boot: Fix unaligned access with CONFIG_MIPS_RAW_APPENDED_DTB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The compressed payload is not necesarily 4-byte aligned, at least when compiling with Clang. In that case, the 4-byte value appended to the compressed payload that corresponds to the uncompressed kernel image size must be read using get_unaligned_le32(). This fixes Clang-built kernels not booting on MIPS (tested on a Ingenic JZ4770 board). Fixes: b8f54f2cde78 ("MIPS: ZBOOT: copy appended dtb to the end of the kernel") Cc: # v4.7 Signed-off-by: Paul Cercueil Reviewed-by: Nick Desaulniers Reviewed-by: Philippe Mathieu-Daudé Signed-off-by: Thomas Bogendoerfer --- arch/mips/boot/compressed/decompress.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/mips/boot/compressed/decompress.c b/arch/mips/boot/compressed/decompress.c index c61c641674e6..e3946b06e840 100644 --- a/arch/mips/boot/compressed/decompress.c +++ b/arch/mips/boot/compressed/decompress.c @@ -13,6 +13,7 @@ #include #include +#include /* * These two variables specify the free mem region @@ -117,7 +118,7 @@ void decompress_kernel(unsigned long boot_heap_start) dtb_size = fdt_totalsize((void *)&__appended_dtb); /* last four bytes is always image size in little endian */ - image_size = le32_to_cpup((void *)&__image_end - 4); + image_size = get_unaligned_le32((void *)&__image_end - 4); /* copy dtb to where the booted kernel will expect it */ memcpy((void *)VMLINUX_LOAD_ADDRESS_ULL + image_size, From 698222457465ce343443be81c5512edda86e5914 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 24 Dec 2020 19:44:38 +0000 Subject: [PATCH 010/241] MIPS: Fix malformed NT_FILE and NT_SIGINFO in 32bit coredumps Patches that introduced NT_FILE and NT_SIGINFO notes back in 2012 had taken care of native (fs/binfmt_elf.c) and compat (fs/compat_binfmt_elf.c) coredumps; unfortunately, compat on mips (which does not go through the usual compat_binfmt_elf.c) had not been noticed. As the result, both N32 and O32 coredumps on 64bit mips kernels have those sections malformed enough to confuse the living hell out of all gdb and readelf versions (up to and including the tip of binutils-gdb.git). Longer term solution is to make both O32 and N32 compat use the regular compat_binfmt_elf.c, but that's too much for backports. The minimal solution is to do in arch/mips/kernel/binfmt_elf[on]32.c the same thing those patches have done in fs/compat_binfmt_elf.c Cc: stable@kernel.org # v3.7+ Signed-off-by: Al Viro Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/binfmt_elfn32.c | 7 +++++++ arch/mips/kernel/binfmt_elfo32.c | 7 +++++++ 2 files changed, 14 insertions(+) diff --git a/arch/mips/kernel/binfmt_elfn32.c b/arch/mips/kernel/binfmt_elfn32.c index 6ee3f7218c67..c4441416e96b 100644 --- a/arch/mips/kernel/binfmt_elfn32.c +++ b/arch/mips/kernel/binfmt_elfn32.c @@ -103,4 +103,11 @@ jiffies_to_old_timeval32(unsigned long jiffies, struct old_timeval32 *value) #undef ns_to_kernel_old_timeval #define ns_to_kernel_old_timeval ns_to_old_timeval32 +/* + * Some data types as stored in coredump. + */ +#define user_long_t compat_long_t +#define user_siginfo_t compat_siginfo_t +#define copy_siginfo_to_external copy_siginfo_to_external32 + #include "../../../fs/binfmt_elf.c" diff --git a/arch/mips/kernel/binfmt_elfo32.c b/arch/mips/kernel/binfmt_elfo32.c index 6dd103d3cebb..7b2a23f48c1a 100644 --- a/arch/mips/kernel/binfmt_elfo32.c +++ b/arch/mips/kernel/binfmt_elfo32.c @@ -106,4 +106,11 @@ jiffies_to_old_timeval32(unsigned long jiffies, struct old_timeval32 *value) #undef ns_to_kernel_old_timeval #define ns_to_kernel_old_timeval ns_to_old_timeval32 +/* + * Some data types as stored in coredump. + */ +#define user_long_t compat_long_t +#define user_siginfo_t compat_siginfo_t +#define copy_siginfo_to_external copy_siginfo_to_external32 + #include "../../../fs/binfmt_elf.c" From cc07d72bf350b77faeffee1c37bc52197171473f Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 24 Sep 2020 13:14:52 -0400 Subject: [PATCH 011/241] dm raid: fix discard limits for raid1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Block core warned that discard_granularity was 0 for dm-raid with personality of raid1. Reason is that raid_io_hints() was incorrectly special-casing raid1 rather than raid0. Fix raid_io_hints() by removing discard limits settings for raid1. Check for raid0 instead. Fixes: 61697a6abd24a ("dm: eliminate 'split_discard_bios' flag from DM target interface") Cc: stable@vger.kernel.org Reported-by: Zdenek Kabelac Reported-by: Mikulas Patocka Reported-by: Stephan Bärwolf Signed-off-by: Mike Snitzer --- drivers/md/dm-raid.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 23c38777e8f6..cab12b2251ba 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3729,10 +3729,10 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs)); /* - * RAID1 and RAID10 personalities require bio splitting, - * RAID0/4/5/6 don't and process large discard bios properly. + * RAID0 and RAID10 personalities require bio splitting, + * RAID1/4/5/6 don't and process large discard bios properly. */ - if (rs_is_raid1(rs) || rs_is_raid10(rs)) { + if (rs_is_raid0(rs) || rs_is_raid10(rs)) { limits->discard_granularity = chunk_size_bytes; limits->max_discard_sectors = rs->md.chunk_sectors; } From f7b347acb5f6c29d9229bb64893d8b6a2c7949fb Mon Sep 17 00:00:00 2001 From: Anthony Iliopoulos Date: Mon, 14 Dec 2020 18:18:11 +0100 Subject: [PATCH 012/241] dm integrity: select CRYPTO_SKCIPHER The integrity target relies on skcipher for encryption/decryption, but certain kernel configurations may not enable CRYPTO_SKCIPHER, leading to compilation errors due to unresolved symbols. Explicitly select CRYPTO_SKCIPHER for DM_INTEGRITY, since it is unconditionally dependent on it. Signed-off-by: Anthony Iliopoulos Signed-off-by: Mike Snitzer --- drivers/md/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index b7e2d9666614..a378a8967094 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -605,6 +605,7 @@ config DM_INTEGRITY select BLK_DEV_INTEGRITY select DM_BUFIO select CRYPTO + select CRYPTO_SKCIPHER select ASYNC_XOR help This device-mapper target emulates a block device that has From b690bd546b227c32b860dae985a18bed8aa946fe Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 3 Jan 2021 22:40:51 +0100 Subject: [PATCH 013/241] dm zoned: select CONFIG_CRC32 Without crc32 support, this driver fails to link: arm-linux-gnueabi-ld: drivers/md/dm-zoned-metadata.o: in function `dmz_write_sb': dm-zoned-metadata.c:(.text+0xe98): undefined reference to `crc32_le' arm-linux-gnueabi-ld: drivers/md/dm-zoned-metadata.o: in function `dmz_check_sb': dm-zoned-metadata.c:(.text+0x7978): undefined reference to `crc32_le' Fixes: 3b1a94c88b79 ("dm zoned: drive-managed zoned block device target") Signed-off-by: Arnd Bergmann Reviewed-by: Damien Le Moal Signed-off-by: Mike Snitzer --- drivers/md/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index a378a8967094..9e44c09f6410 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -623,6 +623,7 @@ config DM_ZONED tristate "Drive-managed zoned block device target support" depends on BLK_DEV_DM depends on BLK_DEV_ZONED + select CRC32 help This device-mapper target takes a host-managed or host-aware zoned block device and exposes most of its capacity as a regular block From 8abec36d1274bbd5ae8f36f3658b9abb3db56c31 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 4 Jan 2021 14:59:47 +0000 Subject: [PATCH 014/241] dm crypt: do not wait for backlogged crypto request completion in softirq Commit 39d42fa96ba1 ("dm crypt: add flags to optionally bypass kcryptd workqueues") made it possible for some code paths in dm-crypt to be executed in softirq context, when the underlying driver processes IO requests in interrupt/softirq context. When Crypto API backlogs a crypto request, dm-crypt uses wait_for_completion to avoid sending further requests to an already overloaded crypto driver. However, if the code is executing in softirq context, we might get the following stacktrace: [ 210.235213][ C0] BUG: scheduling while atomic: fio/2602/0x00000102 [ 210.236701][ C0] Modules linked in: [ 210.237566][ C0] CPU: 0 PID: 2602 Comm: fio Tainted: G W 5.10.0+ #50 [ 210.239292][ C0] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 [ 210.241233][ C0] Call Trace: [ 210.241946][ C0] [ 210.242561][ C0] dump_stack+0x7d/0xa3 [ 210.243466][ C0] __schedule_bug.cold+0xb3/0xc2 [ 210.244539][ C0] __schedule+0x156f/0x20d0 [ 210.245518][ C0] ? io_schedule_timeout+0x140/0x140 [ 210.246660][ C0] schedule+0xd0/0x270 [ 210.247541][ C0] schedule_timeout+0x1fb/0x280 [ 210.248586][ C0] ? usleep_range+0x150/0x150 [ 210.249624][ C0] ? unpoison_range+0x3a/0x60 [ 210.250632][ C0] ? ____kasan_kmalloc.constprop.0+0x82/0xa0 [ 210.251949][ C0] ? unpoison_range+0x3a/0x60 [ 210.252958][ C0] ? __prepare_to_swait+0xa7/0x190 [ 210.254067][ C0] do_wait_for_common+0x2ab/0x370 [ 210.255158][ C0] ? usleep_range+0x150/0x150 [ 210.256192][ C0] ? bit_wait_io_timeout+0x160/0x160 [ 210.257358][ C0] ? blk_update_request+0x757/0x1150 [ 210.258582][ C0] ? _raw_spin_lock_irq+0x82/0xd0 [ 210.259674][ C0] ? _raw_read_unlock_irqrestore+0x30/0x30 [ 210.260917][ C0] wait_for_completion+0x4c/0x90 [ 210.261971][ C0] crypt_convert+0x19a6/0x4c00 [ 210.263033][ C0] ? _raw_spin_lock_irqsave+0x87/0xe0 [ 210.264193][ C0] ? kasan_set_track+0x1c/0x30 [ 210.265191][ C0] ? crypt_iv_tcw_ctr+0x4a0/0x4a0 [ 210.266283][ C0] ? kmem_cache_free+0x104/0x470 [ 210.267363][ C0] ? crypt_endio+0x91/0x180 [ 210.268327][ C0] kcryptd_crypt_read_convert+0x30e/0x420 [ 210.269565][ C0] blk_update_request+0x757/0x1150 [ 210.270563][ C0] blk_mq_end_request+0x4b/0x480 [ 210.271680][ C0] blk_done_softirq+0x21d/0x340 [ 210.272775][ C0] ? _raw_spin_lock+0x81/0xd0 [ 210.273847][ C0] ? blk_mq_stop_hw_queue+0x30/0x30 [ 210.275031][ C0] ? _raw_read_lock_irq+0x40/0x40 [ 210.276182][ C0] __do_softirq+0x190/0x611 [ 210.277203][ C0] ? handle_edge_irq+0x221/0xb60 [ 210.278340][ C0] asm_call_irq_on_stack+0x12/0x20 [ 210.279514][ C0] [ 210.280164][ C0] do_softirq_own_stack+0x37/0x40 [ 210.281281][ C0] irq_exit_rcu+0x110/0x1b0 [ 210.282286][ C0] common_interrupt+0x74/0x120 [ 210.283376][ C0] asm_common_interrupt+0x1e/0x40 [ 210.284496][ C0] RIP: 0010:_aesni_enc1+0x65/0xb0 Fix this by making crypt_convert function reentrant from the point of a single bio and make dm-crypt defer further bio processing to a workqueue, if Crypto API backlogs a request in interrupt context. Fixes: 39d42fa96ba1 ("dm crypt: add flags to optionally bypass kcryptd workqueues") Cc: stable@vger.kernel.org # v5.9+ Signed-off-by: Ignat Korchagin Acked-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 103 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 98 insertions(+), 5 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 53791138d78b..527b1475b7a0 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1529,13 +1529,19 @@ static void crypt_free_req(struct crypt_config *cc, void *req, struct bio *base_ * Encrypt / decrypt data from one bio to another one (can be the same one) */ static blk_status_t crypt_convert(struct crypt_config *cc, - struct convert_context *ctx, bool atomic) + struct convert_context *ctx, bool atomic, bool reset_pending) { unsigned int tag_offset = 0; unsigned int sector_step = cc->sector_size >> SECTOR_SHIFT; int r; - atomic_set(&ctx->cc_pending, 1); + /* + * if reset_pending is set we are dealing with the bio for the first time, + * else we're continuing to work on the previous bio, so don't mess with + * the cc_pending counter + */ + if (reset_pending) + atomic_set(&ctx->cc_pending, 1); while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) { @@ -1553,7 +1559,25 @@ static blk_status_t crypt_convert(struct crypt_config *cc, * but the driver request queue is full, let's wait. */ case -EBUSY: - wait_for_completion(&ctx->restart); + if (in_interrupt()) { + if (try_wait_for_completion(&ctx->restart)) { + /* + * we don't have to block to wait for completion, + * so proceed + */ + } else { + /* + * we can't wait for completion without blocking + * exit and continue processing in a workqueue + */ + ctx->r.req = NULL; + ctx->cc_sector += sector_step; + tag_offset++; + return BLK_STS_DEV_RESOURCE; + } + } else { + wait_for_completion(&ctx->restart); + } reinit_completion(&ctx->restart); fallthrough; /* @@ -1945,6 +1969,37 @@ static bool kcryptd_crypt_write_inline(struct crypt_config *cc, } } +static void kcryptd_crypt_write_continue(struct work_struct *work) +{ + struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); + struct crypt_config *cc = io->cc; + struct convert_context *ctx = &io->ctx; + int crypt_finished; + sector_t sector = io->sector; + blk_status_t r; + + wait_for_completion(&ctx->restart); + reinit_completion(&ctx->restart); + + r = crypt_convert(cc, &io->ctx, true, false); + if (r) + io->error = r; + crypt_finished = atomic_dec_and_test(&ctx->cc_pending); + if (!crypt_finished && kcryptd_crypt_write_inline(cc, ctx)) { + /* Wait for completion signaled by kcryptd_async_done() */ + wait_for_completion(&ctx->restart); + crypt_finished = 1; + } + + /* Encryption was already finished, submit io now */ + if (crypt_finished) { + kcryptd_crypt_write_io_submit(io, 0); + io->sector = sector; + } + + crypt_dec_pending(io); +} + static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; @@ -1973,7 +2028,17 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) crypt_inc_pending(io); r = crypt_convert(cc, ctx, - test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags)); + test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags), true); + /* + * Crypto API backlogged the request, because its queue was full + * and we're in softirq context, so continue from a workqueue + * (TODO: is it actually possible to be in softirq in the write path?) + */ + if (r == BLK_STS_DEV_RESOURCE) { + INIT_WORK(&io->work, kcryptd_crypt_write_continue); + queue_work(cc->crypt_queue, &io->work); + return; + } if (r) io->error = r; crypt_finished = atomic_dec_and_test(&ctx->cc_pending); @@ -1998,6 +2063,25 @@ static void kcryptd_crypt_read_done(struct dm_crypt_io *io) crypt_dec_pending(io); } +static void kcryptd_crypt_read_continue(struct work_struct *work) +{ + struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); + struct crypt_config *cc = io->cc; + blk_status_t r; + + wait_for_completion(&io->ctx.restart); + reinit_completion(&io->ctx.restart); + + r = crypt_convert(cc, &io->ctx, true, false); + if (r) + io->error = r; + + if (atomic_dec_and_test(&io->ctx.cc_pending)) + kcryptd_crypt_read_done(io); + + crypt_dec_pending(io); +} + static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) { struct crypt_config *cc = io->cc; @@ -2009,7 +2093,16 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) io->sector); r = crypt_convert(cc, &io->ctx, - test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags)); + test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags), true); + /* + * Crypto API backlogged the request, because its queue was full + * and we're in softirq context, so continue from a workqueue + */ + if (r == BLK_STS_DEV_RESOURCE) { + INIT_WORK(&io->work, kcryptd_crypt_read_continue); + queue_work(cc->crypt_queue, &io->work); + return; + } if (r) io->error = r; From d68b29584c25dbacd01ed44a3e45abb35353f1de Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Mon, 4 Jan 2021 14:59:48 +0000 Subject: [PATCH 015/241] dm crypt: use GFP_ATOMIC when allocating crypto requests from softirq Commit 39d42fa96ba1 ("dm crypt: add flags to optionally bypass kcryptd workqueues") made it possible for some code paths in dm-crypt to be executed in softirq context, when the underlying driver processes IO requests in interrupt/softirq context. In this case sometimes when allocating a new crypto request we may get a stacktrace like below: [ 210.103008][ C0] BUG: sleeping function called from invalid context at mm/mempool.c:381 [ 210.104746][ C0] in_atomic(): 1, irqs_disabled(): 0, non_block: 0, pid: 2602, name: fio [ 210.106599][ C0] CPU: 0 PID: 2602 Comm: fio Tainted: G W 5.10.0+ #50 [ 210.108331][ C0] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 0.0.0 02/06/2015 [ 210.110212][ C0] Call Trace: [ 210.110921][ C0] [ 210.111527][ C0] dump_stack+0x7d/0xa3 [ 210.112411][ C0] ___might_sleep.cold+0x122/0x151 [ 210.113527][ C0] mempool_alloc+0x16b/0x2f0 [ 210.114524][ C0] ? __queue_work+0x515/0xde0 [ 210.115553][ C0] ? mempool_resize+0x700/0x700 [ 210.116586][ C0] ? crypt_endio+0x91/0x180 [ 210.117479][ C0] ? blk_update_request+0x757/0x1150 [ 210.118513][ C0] ? blk_mq_end_request+0x4b/0x480 [ 210.119572][ C0] ? blk_done_softirq+0x21d/0x340 [ 210.120628][ C0] ? __do_softirq+0x190/0x611 [ 210.121626][ C0] crypt_convert+0x29f9/0x4c00 [ 210.122668][ C0] ? _raw_spin_lock_irqsave+0x87/0xe0 [ 210.123824][ C0] ? kasan_set_track+0x1c/0x30 [ 210.124858][ C0] ? crypt_iv_tcw_ctr+0x4a0/0x4a0 [ 210.125930][ C0] ? kmem_cache_free+0x104/0x470 [ 210.126973][ C0] ? crypt_endio+0x91/0x180 [ 210.127947][ C0] kcryptd_crypt_read_convert+0x30e/0x420 [ 210.129165][ C0] blk_update_request+0x757/0x1150 [ 210.130231][ C0] blk_mq_end_request+0x4b/0x480 [ 210.131294][ C0] blk_done_softirq+0x21d/0x340 [ 210.132332][ C0] ? _raw_spin_lock+0x81/0xd0 [ 210.133289][ C0] ? blk_mq_stop_hw_queue+0x30/0x30 [ 210.134399][ C0] ? _raw_read_lock_irq+0x40/0x40 [ 210.135458][ C0] __do_softirq+0x190/0x611 [ 210.136409][ C0] ? handle_edge_irq+0x221/0xb60 [ 210.137447][ C0] asm_call_irq_on_stack+0x12/0x20 [ 210.138507][ C0] [ 210.139118][ C0] do_softirq_own_stack+0x37/0x40 [ 210.140191][ C0] irq_exit_rcu+0x110/0x1b0 [ 210.141151][ C0] common_interrupt+0x74/0x120 [ 210.142171][ C0] asm_common_interrupt+0x1e/0x40 Fix this by allocating crypto requests with GFP_ATOMIC mask in interrupt context. Fixes: 39d42fa96ba1 ("dm crypt: add flags to optionally bypass kcryptd workqueues") Cc: stable@vger.kernel.org # v5.9+ Reported-by: Maciej S. Szmigiero Signed-off-by: Ignat Korchagin Acked-by: Mikulas Patocka Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 527b1475b7a0..d2d6d3000f5b 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1454,13 +1454,16 @@ static int crypt_convert_block_skcipher(struct crypt_config *cc, static void kcryptd_async_done(struct crypto_async_request *async_req, int error); -static void crypt_alloc_req_skcipher(struct crypt_config *cc, +static int crypt_alloc_req_skcipher(struct crypt_config *cc, struct convert_context *ctx) { unsigned key_index = ctx->cc_sector & (cc->tfms_count - 1); - if (!ctx->r.req) - ctx->r.req = mempool_alloc(&cc->req_pool, GFP_NOIO); + if (!ctx->r.req) { + ctx->r.req = mempool_alloc(&cc->req_pool, in_interrupt() ? GFP_ATOMIC : GFP_NOIO); + if (!ctx->r.req) + return -ENOMEM; + } skcipher_request_set_tfm(ctx->r.req, cc->cipher_tfm.tfms[key_index]); @@ -1471,13 +1474,18 @@ static void crypt_alloc_req_skcipher(struct crypt_config *cc, skcipher_request_set_callback(ctx->r.req, CRYPTO_TFM_REQ_MAY_BACKLOG, kcryptd_async_done, dmreq_of_req(cc, ctx->r.req)); + + return 0; } -static void crypt_alloc_req_aead(struct crypt_config *cc, +static int crypt_alloc_req_aead(struct crypt_config *cc, struct convert_context *ctx) { - if (!ctx->r.req_aead) - ctx->r.req_aead = mempool_alloc(&cc->req_pool, GFP_NOIO); + if (!ctx->r.req) { + ctx->r.req = mempool_alloc(&cc->req_pool, in_interrupt() ? GFP_ATOMIC : GFP_NOIO); + if (!ctx->r.req) + return -ENOMEM; + } aead_request_set_tfm(ctx->r.req_aead, cc->cipher_tfm.tfms_aead[0]); @@ -1488,15 +1496,17 @@ static void crypt_alloc_req_aead(struct crypt_config *cc, aead_request_set_callback(ctx->r.req_aead, CRYPTO_TFM_REQ_MAY_BACKLOG, kcryptd_async_done, dmreq_of_req(cc, ctx->r.req_aead)); + + return 0; } -static void crypt_alloc_req(struct crypt_config *cc, +static int crypt_alloc_req(struct crypt_config *cc, struct convert_context *ctx) { if (crypt_integrity_aead(cc)) - crypt_alloc_req_aead(cc, ctx); + return crypt_alloc_req_aead(cc, ctx); else - crypt_alloc_req_skcipher(cc, ctx); + return crypt_alloc_req_skcipher(cc, ctx); } static void crypt_free_req_skcipher(struct crypt_config *cc, @@ -1545,7 +1555,12 @@ static blk_status_t crypt_convert(struct crypt_config *cc, while (ctx->iter_in.bi_size && ctx->iter_out.bi_size) { - crypt_alloc_req(cc, ctx); + r = crypt_alloc_req(cc, ctx); + if (r) { + complete(&ctx->restart); + return BLK_STS_DEV_RESOURCE; + } + atomic_inc(&ctx->cc_pending); if (crypt_integrity_aead(cc)) From a0a6df9afcaf439a6b4c88a3b522e3d05fdef46f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 4 Jan 2021 15:25:34 -0500 Subject: [PATCH 016/241] umount(2): move the flag validity checks first Unfortunately, there's userland code that used to rely upon these checks being done before anything else to check for UMOUNT_NOFOLLOW support. That broke in 41525f56e256 ("fs: refactor ksys_umount"). Separate those from the rest of checks and move them to ksys_umount(); unlike everything else in there, this can be sanely done there. Reported-by: Sargun Dhillon Fixes: 41525f56e256 ("fs: refactor ksys_umount") Signed-off-by: Al Viro --- fs/namespace.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index d2db7dfe232b..9d33909d0f9e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1713,8 +1713,6 @@ static int can_umount(const struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); - if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) - return -EINVAL; if (!may_mount()) return -EPERM; if (path->dentry != path->mnt->mnt_root) @@ -1728,6 +1726,7 @@ static int can_umount(const struct path *path, int flags) return 0; } +// caller is responsible for flags being sane int path_umount(struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); @@ -1749,6 +1748,10 @@ static int ksys_umount(char __user *name, int flags) struct path path; int ret; + // basic validity checks done first + if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) + return -EINVAL; + if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; ret = user_path_at(AT_FDCWD, name, lookup_flags, &path); From 1d53864c3617f5235f891ca0fbe9347c4cd35d46 Mon Sep 17 00:00:00 2001 From: Stanley Chu Date: Tue, 22 Dec 2020 15:29:04 +0800 Subject: [PATCH 017/241] scsi: ufs: Fix possible power drain during system suspend Currently if device needs to do flush or BKOP operations, the device VCC power is kept during runtime-suspend period. However, if system suspend is happening while device is runtime-suspended, such power may not be disabled successfully. The reasons may be, 1. If current PM level is the same as SPM level, device will keep runtime-suspended by ufshcd_system_suspend(). 2. Flush recheck work may not be scheduled successfully during system suspend period. If it can wake up the system, this is also not the intention of the recheck work. To fix this issue, simply runtime-resume the device if the flush is allowed during runtime suspend period. Flush capability will be disabled while leaving runtime suspend, and also not be allowed in system suspend period. Link: https://lore.kernel.org/r/20201222072905.32221-2-stanley.chu@mediatek.com Fixes: 51dd905bd2f6 ("scsi: ufs: Fix WriteBooster flush during runtime suspend") Reviewed-by: Chaotian Jing Reviewed-by: Can Guo Signed-off-by: Stanley Chu Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 82ad31781bc9..9db348650f17 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -8938,7 +8938,8 @@ int ufshcd_system_suspend(struct ufs_hba *hba) if ((ufs_get_pm_lvl_to_dev_pwr_mode(hba->spm_lvl) == hba->curr_dev_pwr_mode) && (ufs_get_pm_lvl_to_link_pwr_state(hba->spm_lvl) == - hba->uic_link_state)) + hba->uic_link_state) && + !hba->dev_info.b_rpm_dev_flush_capable) goto out; if (pm_runtime_suspended(hba->dev)) { From 21acf4601cc63cf564c6fc1a74d81b191313c929 Mon Sep 17 00:00:00 2001 From: Stanley Chu Date: Tue, 22 Dec 2020 15:29:05 +0800 Subject: [PATCH 018/241] scsi: ufs: Relax the condition of UFSHCI_QUIRK_SKIP_MANUAL_WB_FLUSH_CTRL UFSHCI_QUIRK_SKIP_MANUAL_WB_FLUSH_CTRL is intended to skip enabling fWriteBoosterBufferFlushEn while WriteBooster is initializing. Therefore it is better to apply the checking during WriteBooster initialization only. Link: https://lore.kernel.org/r/20201222072905.32221-3-stanley.chu@mediatek.com Reviewed-by: Can Guo Signed-off-by: Stanley Chu Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 9db348650f17..04ab6f2ccac6 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -289,7 +289,8 @@ static inline void ufshcd_wb_config(struct ufs_hba *hba) if (ret) dev_err(hba->dev, "%s: En WB flush during H8: failed: %d\n", __func__, ret); - ufshcd_wb_toggle_flush(hba, true); + if (!(hba->quirks & UFSHCI_QUIRK_SKIP_MANUAL_WB_FLUSH_CTRL)) + ufshcd_wb_toggle_flush(hba, true); } static void ufshcd_scsi_unblock_requests(struct ufs_hba *hba) @@ -5436,9 +5437,6 @@ static int ufshcd_wb_toggle_flush_during_h8(struct ufs_hba *hba, bool set) static inline void ufshcd_wb_toggle_flush(struct ufs_hba *hba, bool enable) { - if (hba->quirks & UFSHCI_QUIRK_SKIP_MANUAL_WB_FLUSH_CTRL) - return; - if (enable) ufshcd_wb_buf_flush_enable(hba); else From 4ceb06e7c336f4a8d3f3b6ac9a4fea2e9c97dc07 Mon Sep 17 00:00:00 2001 From: Colin Xu Date: Tue, 1 Dec 2020 14:03:29 +0800 Subject: [PATCH 019/241] drm/i915/gvt: Fix vfio_edid issue for BXT/APL BXT/APL has different isr/irr/hpd regs compared with other GEN9. If not setting these regs bits correctly according to the emulated monitor (currently a DP on PORT_B), although gvt still triggers a virtual HPD event, the guest driver won't detect a valid HPD pulse thus no full display detection will be executed to read the updated EDID. With this patch, the vfio_edid is enabled again on BXT/APL, which is previously disabled. Fixes: 642403e3599e ("drm/i915/gvt: Temporarily disable vfio_edid for BXT/APL") Signed-off-by: Colin Xu Signed-off-by: Zhenyu Wang Link: http://patchwork.freedesktop.org/patch/msgid/20201201060329.142375-1-colin.xu@intel.com Reviewed-by: Zhenyu Wang --- drivers/gpu/drm/i915/gvt/display.c | 85 +++++++++++++++++++++--------- drivers/gpu/drm/i915/gvt/vgpu.c | 5 +- 2 files changed, 61 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/i915/gvt/display.c b/drivers/gpu/drm/i915/gvt/display.c index a15f87539657..62a5b0dd2003 100644 --- a/drivers/gpu/drm/i915/gvt/display.c +++ b/drivers/gpu/drm/i915/gvt/display.c @@ -217,6 +217,15 @@ static void emulate_monitor_status_change(struct intel_vgpu *vgpu) DDI_BUF_CTL_ENABLE); vgpu_vreg_t(vgpu, DDI_BUF_CTL(port)) |= DDI_BUF_IS_IDLE; } + vgpu_vreg_t(vgpu, PCH_PORT_HOTPLUG) &= + ~(PORTA_HOTPLUG_ENABLE | PORTA_HOTPLUG_STATUS_MASK); + vgpu_vreg_t(vgpu, PCH_PORT_HOTPLUG) &= + ~(PORTB_HOTPLUG_ENABLE | PORTB_HOTPLUG_STATUS_MASK); + vgpu_vreg_t(vgpu, PCH_PORT_HOTPLUG) &= + ~(PORTC_HOTPLUG_ENABLE | PORTC_HOTPLUG_STATUS_MASK); + /* No hpd_invert set in vgpu vbt, need to clear invert mask */ + vgpu_vreg_t(vgpu, PCH_PORT_HOTPLUG) &= ~BXT_DDI_HPD_INVERT_MASK; + vgpu_vreg_t(vgpu, GEN8_DE_PORT_ISR) &= ~BXT_DE_PORT_HOTPLUG_MASK; vgpu_vreg_t(vgpu, BXT_P_CR_GT_DISP_PWRON) &= ~(BIT(0) | BIT(1)); vgpu_vreg_t(vgpu, BXT_PORT_CL1CM_DW0(DPIO_PHY0)) &= @@ -273,6 +282,8 @@ static void emulate_monitor_status_change(struct intel_vgpu *vgpu) vgpu_vreg_t(vgpu, TRANS_DDI_FUNC_CTL(TRANSCODER_EDP)) |= (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DP_SST | TRANS_DDI_FUNC_ENABLE); + vgpu_vreg_t(vgpu, PCH_PORT_HOTPLUG) |= + PORTA_HOTPLUG_ENABLE; vgpu_vreg_t(vgpu, GEN8_DE_PORT_ISR) |= GEN8_DE_PORT_HOTPLUG(HPD_PORT_A); } @@ -301,6 +312,8 @@ static void emulate_monitor_status_change(struct intel_vgpu *vgpu) (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DP_SST | (PORT_B << TRANS_DDI_PORT_SHIFT) | TRANS_DDI_FUNC_ENABLE); + vgpu_vreg_t(vgpu, PCH_PORT_HOTPLUG) |= + PORTB_HOTPLUG_ENABLE; vgpu_vreg_t(vgpu, GEN8_DE_PORT_ISR) |= GEN8_DE_PORT_HOTPLUG(HPD_PORT_B); } @@ -329,6 +342,8 @@ static void emulate_monitor_status_change(struct intel_vgpu *vgpu) (TRANS_DDI_BPC_8 | TRANS_DDI_MODE_SELECT_DP_SST | (PORT_B << TRANS_DDI_PORT_SHIFT) | TRANS_DDI_FUNC_ENABLE); + vgpu_vreg_t(vgpu, PCH_PORT_HOTPLUG) |= + PORTC_HOTPLUG_ENABLE; vgpu_vreg_t(vgpu, GEN8_DE_PORT_ISR) |= GEN8_DE_PORT_HOTPLUG(HPD_PORT_C); } @@ -661,44 +676,62 @@ void intel_vgpu_emulate_hotplug(struct intel_vgpu *vgpu, bool connected) PORTD_HOTPLUG_STATUS_MASK; intel_vgpu_trigger_virtual_event(vgpu, DP_D_HOTPLUG); } else if (IS_BROXTON(i915)) { - if (connected) { - if (intel_vgpu_has_monitor_on_port(vgpu, PORT_A)) { + if (intel_vgpu_has_monitor_on_port(vgpu, PORT_A)) { + if (connected) { vgpu_vreg_t(vgpu, GEN8_DE_PORT_ISR) |= GEN8_DE_PORT_HOTPLUG(HPD_PORT_A); - } - if (intel_vgpu_has_monitor_on_port(vgpu, PORT_B)) { - vgpu_vreg_t(vgpu, SFUSE_STRAP) |= - SFUSE_STRAP_DDIB_DETECTED; - vgpu_vreg_t(vgpu, GEN8_DE_PORT_ISR) |= - GEN8_DE_PORT_HOTPLUG(HPD_PORT_B); - } - if (intel_vgpu_has_monitor_on_port(vgpu, PORT_C)) { - vgpu_vreg_t(vgpu, SFUSE_STRAP) |= - SFUSE_STRAP_DDIC_DETECTED; - vgpu_vreg_t(vgpu, GEN8_DE_PORT_ISR) |= - GEN8_DE_PORT_HOTPLUG(HPD_PORT_C); - } - } else { - if (intel_vgpu_has_monitor_on_port(vgpu, PORT_A)) { + } else { vgpu_vreg_t(vgpu, GEN8_DE_PORT_ISR) &= ~GEN8_DE_PORT_HOTPLUG(HPD_PORT_A); } - if (intel_vgpu_has_monitor_on_port(vgpu, PORT_B)) { - vgpu_vreg_t(vgpu, SFUSE_STRAP) &= - ~SFUSE_STRAP_DDIB_DETECTED; + vgpu_vreg_t(vgpu, GEN8_DE_PORT_IIR) |= + GEN8_DE_PORT_HOTPLUG(HPD_PORT_A); + vgpu_vreg_t(vgpu, PCH_PORT_HOTPLUG) &= + ~PORTA_HOTPLUG_STATUS_MASK; + vgpu_vreg_t(vgpu, PCH_PORT_HOTPLUG) |= + PORTA_HOTPLUG_LONG_DETECT; + intel_vgpu_trigger_virtual_event(vgpu, DP_A_HOTPLUG); + } + if (intel_vgpu_has_monitor_on_port(vgpu, PORT_B)) { + if (connected) { + vgpu_vreg_t(vgpu, GEN8_DE_PORT_ISR) |= + GEN8_DE_PORT_HOTPLUG(HPD_PORT_B); + vgpu_vreg_t(vgpu, SFUSE_STRAP) |= + SFUSE_STRAP_DDIB_DETECTED; + } else { vgpu_vreg_t(vgpu, GEN8_DE_PORT_ISR) &= ~GEN8_DE_PORT_HOTPLUG(HPD_PORT_B); - } - if (intel_vgpu_has_monitor_on_port(vgpu, PORT_C)) { vgpu_vreg_t(vgpu, SFUSE_STRAP) &= - ~SFUSE_STRAP_DDIC_DETECTED; + ~SFUSE_STRAP_DDIB_DETECTED; + } + vgpu_vreg_t(vgpu, GEN8_DE_PORT_IIR) |= + GEN8_DE_PORT_HOTPLUG(HPD_PORT_B); + vgpu_vreg_t(vgpu, PCH_PORT_HOTPLUG) &= + ~PORTB_HOTPLUG_STATUS_MASK; + vgpu_vreg_t(vgpu, PCH_PORT_HOTPLUG) |= + PORTB_HOTPLUG_LONG_DETECT; + intel_vgpu_trigger_virtual_event(vgpu, DP_B_HOTPLUG); + } + if (intel_vgpu_has_monitor_on_port(vgpu, PORT_C)) { + if (connected) { + vgpu_vreg_t(vgpu, GEN8_DE_PORT_ISR) |= + GEN8_DE_PORT_HOTPLUG(HPD_PORT_C); + vgpu_vreg_t(vgpu, SFUSE_STRAP) |= + SFUSE_STRAP_DDIC_DETECTED; + } else { vgpu_vreg_t(vgpu, GEN8_DE_PORT_ISR) &= ~GEN8_DE_PORT_HOTPLUG(HPD_PORT_C); + vgpu_vreg_t(vgpu, SFUSE_STRAP) &= + ~SFUSE_STRAP_DDIC_DETECTED; } + vgpu_vreg_t(vgpu, GEN8_DE_PORT_IIR) |= + GEN8_DE_PORT_HOTPLUG(HPD_PORT_C); + vgpu_vreg_t(vgpu, PCH_PORT_HOTPLUG) &= + ~PORTC_HOTPLUG_STATUS_MASK; + vgpu_vreg_t(vgpu, PCH_PORT_HOTPLUG) |= + PORTC_HOTPLUG_LONG_DETECT; + intel_vgpu_trigger_virtual_event(vgpu, DP_C_HOTPLUG); } - vgpu_vreg_t(vgpu, PCH_PORT_HOTPLUG) |= - PORTB_HOTPLUG_STATUS_MASK; - intel_vgpu_trigger_virtual_event(vgpu, DP_B_HOTPLUG); } } diff --git a/drivers/gpu/drm/i915/gvt/vgpu.c b/drivers/gpu/drm/i915/gvt/vgpu.c index e49944fde333..cbe5931906e0 100644 --- a/drivers/gpu/drm/i915/gvt/vgpu.c +++ b/drivers/gpu/drm/i915/gvt/vgpu.c @@ -437,10 +437,9 @@ static struct intel_vgpu *__intel_gvt_create_vgpu(struct intel_gvt *gvt, if (ret) goto out_clean_sched_policy; - if (IS_BROADWELL(dev_priv)) + if (IS_BROADWELL(dev_priv) || IS_BROXTON(dev_priv)) ret = intel_gvt_hypervisor_set_edid(vgpu, PORT_B); - /* FixMe: Re-enable APL/BXT once vfio_edid enabled */ - else if (!IS_BROXTON(dev_priv)) + else ret = intel_gvt_hypervisor_set_edid(vgpu, PORT_D); if (ret) goto out_clean_sched_policy; From 6948a96a0d69b7e8203758f44849ce4ab06ff788 Mon Sep 17 00:00:00 2001 From: Kiwoong Kim Date: Sat, 19 Dec 2020 15:40:39 +0900 Subject: [PATCH 020/241] scsi: ufs: Relocate flush of exceptional event The current flush location does not guarantee disabling BKOPS for the case of requesting device power off. 1) The exceptional event handler is queued 2) ufs suspend starts with a request of device power off 3) BKOPS is disabled in ufs suspend 4) The queued work for the handler is done and BKOPS is re-enabled Relocate the flush statement to ensure BKOPS remain disabled. Link: https://lore.kernel.org/r/1608360039-16390-1-git-send-email-kwmad.kim@samsung.com Reviewed-by: Can Guo Reviewed-by: Stanley Chu Signed-off-by: Kiwoong Kim Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 04ab6f2ccac6..00ee8cabf8e0 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -8696,6 +8696,8 @@ static int ufshcd_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op) ufshcd_wb_need_flush(hba)); } + flush_work(&hba->eeh_work); + if (req_dev_pwr_mode != hba->curr_dev_pwr_mode) { if (!ufshcd_is_runtime_pm(pm_op)) /* ensure that bkops is disabled */ @@ -8708,8 +8710,6 @@ static int ufshcd_suspend(struct ufs_hba *hba, enum ufs_pm_op pm_op) } } - flush_work(&hba->eeh_work); - /* * In the case of DeepSleep, the device is expected to remain powered * with the link off, so do not check for bkops. From 35fc4cd34426c242ab015ef280853b7bff101f48 Mon Sep 17 00:00:00 2001 From: Can Guo Date: Mon, 28 Dec 2020 04:04:36 -0800 Subject: [PATCH 021/241] scsi: ufs: Correct the LUN used in eh_device_reset_handler() callback Users can initiate resets to specific SCSI device/target/host through IOCTL. When this happens, the SCSI cmd passed to eh_device/target/host _reset_handler() callbacks is initialized with a request whose tag is -1. In this case it is not right for eh_device_reset_handler() callback to count on the LUN get from hba->lrb[-1]. Fix it by getting LUN from the SCSI device associated with the SCSI cmd. Link: https://lore.kernel.org/r/1609157080-26283-1-git-send-email-cang@codeaurora.org Reviewed-by: Avri Altman Reviewed-by: Stanley Chu Signed-off-by: Can Guo Signed-off-by: Martin K. Petersen --- drivers/scsi/ufs/ufshcd.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/ufs/ufshcd.c b/drivers/scsi/ufs/ufshcd.c index 00ee8cabf8e0..e31d2c5c7b23 100644 --- a/drivers/scsi/ufs/ufshcd.c +++ b/drivers/scsi/ufs/ufshcd.c @@ -6659,19 +6659,16 @@ static int ufshcd_eh_device_reset_handler(struct scsi_cmnd *cmd) { struct Scsi_Host *host; struct ufs_hba *hba; - unsigned int tag; u32 pos; int err; - u8 resp = 0xF; - struct ufshcd_lrb *lrbp; + u8 resp = 0xF, lun; unsigned long flags; host = cmd->device->host; hba = shost_priv(host); - tag = cmd->request->tag; - lrbp = &hba->lrb[tag]; - err = ufshcd_issue_tm_cmd(hba, lrbp->lun, 0, UFS_LOGICAL_RESET, &resp); + lun = ufshcd_scsi_to_upiu_lun(cmd->device->lun); + err = ufshcd_issue_tm_cmd(hba, lun, 0, UFS_LOGICAL_RESET, &resp); if (err || resp != UPIU_TASK_MANAGEMENT_FUNC_COMPL) { if (!err) err = resp; @@ -6680,7 +6677,7 @@ static int ufshcd_eh_device_reset_handler(struct scsi_cmnd *cmd) /* clear the commands that were pending for corresponding LUN */ for_each_set_bit(pos, &hba->outstanding_reqs, hba->nutrs) { - if (hba->lrb[pos].lun == lrbp->lun) { + if (hba->lrb[pos].lun == lun) { err = ufshcd_clear_cmd(hba, pos); if (err) break; From d50c7986fbf0e2167279e110a2ed5bd8e811c660 Mon Sep 17 00:00:00 2001 From: Nilesh Javali Date: Thu, 17 Dec 2020 02:51:44 -0800 Subject: [PATCH 022/241] scsi: qedi: Correct max length of CHAP secret The CHAP secret displayed garbage characters causing iSCSI login authentication failure. Correct the CHAP password max length. Link: https://lore.kernel.org/r/20201217105144.8055-1-njavali@marvell.com Reviewed-by: Lee Duncan Signed-off-by: Nilesh Javali Signed-off-by: Martin K. Petersen --- drivers/scsi/qedi/qedi_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/qedi/qedi_main.c b/drivers/scsi/qedi/qedi_main.c index f5fc7f518f8a..47ad64b06623 100644 --- a/drivers/scsi/qedi/qedi_main.c +++ b/drivers/scsi/qedi/qedi_main.c @@ -2245,7 +2245,7 @@ qedi_show_boot_tgt_info(struct qedi_ctx *qedi, int type, chap_name); break; case ISCSI_BOOT_TGT_CHAP_SECRET: - rc = sprintf(buf, "%.*s\n", NVM_ISCSI_CFG_CHAP_NAME_MAX_LEN, + rc = sprintf(buf, "%.*s\n", NVM_ISCSI_CFG_CHAP_PWD_MAX_LEN, chap_secret); break; case ISCSI_BOOT_TGT_REV_CHAP_NAME: @@ -2253,7 +2253,7 @@ qedi_show_boot_tgt_info(struct qedi_ctx *qedi, int type, mchap_name); break; case ISCSI_BOOT_TGT_REV_CHAP_SECRET: - rc = sprintf(buf, "%.*s\n", NVM_ISCSI_CFG_CHAP_NAME_MAX_LEN, + rc = sprintf(buf, "%.*s\n", NVM_ISCSI_CFG_CHAP_PWD_MAX_LEN, mchap_secret); break; case ISCSI_BOOT_TGT_FLAGS: From 39718fe7adb1a79f78be23f058299bc038cbe161 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 17 Dec 2020 17:20:19 +0000 Subject: [PATCH 023/241] scsi: mpt3sas: Fix spelling mistake in Kconfig "compatiblity" -> "compatibility" There is a spelling mistake in the Kconfig help text. Fix it. Link: https://lore.kernel.org/r/20201217172019.57768-1-colin.king@canonical.com Signed-off-by: Colin Ian King Signed-off-by: Martin K. Petersen --- drivers/scsi/mpt3sas/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/mpt3sas/Kconfig b/drivers/scsi/mpt3sas/Kconfig index 86209455172d..c299f7e078fb 100644 --- a/drivers/scsi/mpt3sas/Kconfig +++ b/drivers/scsi/mpt3sas/Kconfig @@ -79,5 +79,5 @@ config SCSI_MPT2SAS select SCSI_MPT3SAS depends on PCI && SCSI help - Dummy config option for backwards compatiblity: configure the MPT3SAS + Dummy config option for backwards compatibility: configure the MPT3SAS driver instead. From 3b01d7ea4dae907d34fa0eeb3f17bacd714c6d0c Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Sat, 26 Dec 2020 14:15:03 +0800 Subject: [PATCH 024/241] scsi: scsi_debug: Fix memleak in scsi_debug_init() When sdeb_zbc_model does not match BLK_ZONED_NONE, BLK_ZONED_HA or BLK_ZONED_HM, we should free sdebug_q_arr to prevent memleak. Also there is no need to execute sdebug_erase_store() on failure of sdeb_zbc_model_str(). Link: https://lore.kernel.org/r/20201226061503.20050-1-dinghao.liu@zju.edu.cn Acked-by: Douglas Gilbert Signed-off-by: Dinghao Liu Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_debug.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 24c0f7ec0351..4a08c450b756 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -6740,7 +6740,7 @@ static int __init scsi_debug_init(void) k = sdeb_zbc_model_str(sdeb_zbc_model_s); if (k < 0) { ret = k; - goto free_vm; + goto free_q_arr; } sdeb_zbc_model = k; switch (sdeb_zbc_model) { @@ -6753,7 +6753,8 @@ static int __init scsi_debug_init(void) break; default: pr_err("Invalid ZBC model\n"); - return -EINVAL; + ret = -EINVAL; + goto free_q_arr; } } if (sdeb_zbc_model != BLK_ZONED_NONE) { From e5cc9002caafacbaa8dab878d17a313192c3b03b Mon Sep 17 00:00:00 2001 From: "Ewan D. Milne" Date: Mon, 7 Dec 2020 17:10:21 -0500 Subject: [PATCH 025/241] scsi: sd: Suppress spurious errors when WRITE SAME is being disabled The block layer code will split a large zeroout request into multiple bios and if WRITE SAME is disabled because the storage device reports that it does not support it (or support the length used), we can get an error message from the block layer despite the setting of RQF_QUIET on the first request. This is because more than one request may have already been submitted. Fix this by setting RQF_QUIET when BLK_STS_TARGET is returned to fail the request early, we don't need to log a message because we did not actually submit the command to the device, and the block layer code will handle the error by submitting individual write bios. Link: https://lore.kernel.org/r/20201207221021.28243-1-emilne@redhat.com Reviewed-by: Christoph Hellwig Signed-off-by: Ewan D. Milne Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 679c2c025047..b766ad54e4a5 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -984,8 +984,10 @@ static blk_status_t sd_setup_write_zeroes_cmnd(struct scsi_cmnd *cmd) } } - if (sdp->no_write_same) + if (sdp->no_write_same) { + rq->rq_flags |= RQF_QUIET; return BLK_STS_TARGET; + } if (sdkp->ws16 || lba > 0xffffffff || nr_blocks > 0xffff) return sd_setup_write_same16_cmnd(cmd, false); From be2553358cd40c0db11d1aa96f819c07413b2aae Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Mon, 14 Dec 2020 10:54:24 +0100 Subject: [PATCH 026/241] scsi: sd: Remove obsolete variable in sd_remove() Commit 996e509bbc95 ("sd: use __register_blkdev to avoid a modprobe for an unregistered dev_t") removed blk_register_region(devt, ...) in sd_remove() and since then, devt is unused in sd_remove(). Hence, make W=1 warns: drivers/scsi/sd.c:3516:8: warning: variable 'devt' set but not used [-Wunused-but-set-variable] Simply remove this obsolete variable. [mkp: fixed commit sha] Link: https://lore.kernel.org/r/20201214095424.12479-1-lukas.bulwahn@gmail.com Reviewed-by: Christoph Hellwig Reviewed-by: Nathan Chancellor Acked-by: Martin K. Petersen Signed-off-by: Lukas Bulwahn Signed-off-by: Martin K. Petersen --- drivers/scsi/sd.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index b766ad54e4a5..a3d2d4bc4a3d 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -3512,10 +3512,8 @@ static int sd_probe(struct device *dev) static int sd_remove(struct device *dev) { struct scsi_disk *sdkp; - dev_t devt; sdkp = dev_get_drvdata(dev); - devt = disk_devt(sdkp->disk); scsi_autopm_get_device(sdkp->device); async_synchronize_full_domain(&scsi_sd_pm_domain); From 8ae291cc95e49011b736b641b0cfad502b7a1526 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Tue, 5 Jan 2021 13:13:27 +0200 Subject: [PATCH 027/241] RDMA/ucma: Do not miss ctx destruction steps in some cases The destruction flow is very complicated here because the cm_id can be destroyed from the event handler at any time if the device is hot-removed. This leaves behind a partial ctx with no cm_id in the xarray, and will let user space leak memory. Make everything consistent in this flow in all places: - Return the xarray back to XA_ZERO_ENTRY before beginning any destruction. The thread that reaches this first is responsible to kfree, everyone else does nothing. - Test the xarray during the special hot-removal case to block the queue_work, this has much simpler locking and doesn't require a 'destroying' - Fix the ref initialization so that it is only positive if cm_id != NULL, then rely on that to guide the destruction process in all cases. Now the new ucma_destroy_private_ctx() can be called in all places that want to free the ctx, including all the error unwinds, and none of the details are missed. Fixes: a1d33b70dbbc ("RDMA/ucma: Rework how new connections are passed through event delivery") Link: https://lore.kernel.org/r/20210105111327.230270-1-leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/ucma.c | 137 ++++++++++++++++++--------------- 1 file changed, 73 insertions(+), 64 deletions(-) diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c index 7dab9a27a145..da2512c30ffd 100644 --- a/drivers/infiniband/core/ucma.c +++ b/drivers/infiniband/core/ucma.c @@ -95,8 +95,6 @@ struct ucma_context { u64 uid; struct list_head list; - /* sync between removal event and id destroy, protected by file mut */ - int destroying; struct work_struct close_work; }; @@ -122,7 +120,7 @@ static DEFINE_XARRAY_ALLOC(ctx_table); static DEFINE_XARRAY_ALLOC(multicast_table); static const struct file_operations ucma_fops; -static int __destroy_id(struct ucma_context *ctx); +static int ucma_destroy_private_ctx(struct ucma_context *ctx); static inline struct ucma_context *_ucma_find_context(int id, struct ucma_file *file) @@ -179,19 +177,14 @@ static void ucma_close_id(struct work_struct *work) /* once all inflight tasks are finished, we close all underlying * resources. The context is still alive till its explicit destryoing - * by its creator. + * by its creator. This puts back the xarray's reference. */ ucma_put_ctx(ctx); wait_for_completion(&ctx->comp); /* No new events will be generated after destroying the id. */ rdma_destroy_id(ctx->cm_id); - /* - * At this point ctx->ref is zero so the only place the ctx can be is in - * a uevent or in __destroy_id(). Since the former doesn't touch - * ctx->cm_id and the latter sync cancels this, there is no races with - * this store. - */ + /* Reading the cm_id without holding a positive ref is not allowed */ ctx->cm_id = NULL; } @@ -204,7 +197,6 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) return NULL; INIT_WORK(&ctx->close_work, ucma_close_id); - refcount_set(&ctx->ref, 1); init_completion(&ctx->comp); /* So list_del() will work if we don't do ucma_finish_ctx() */ INIT_LIST_HEAD(&ctx->list); @@ -218,6 +210,13 @@ static struct ucma_context *ucma_alloc_ctx(struct ucma_file *file) return ctx; } +static void ucma_set_ctx_cm_id(struct ucma_context *ctx, + struct rdma_cm_id *cm_id) +{ + refcount_set(&ctx->ref, 1); + ctx->cm_id = cm_id; +} + static void ucma_finish_ctx(struct ucma_context *ctx) { lockdep_assert_held(&ctx->file->mut); @@ -303,7 +302,7 @@ static int ucma_connect_event_handler(struct rdma_cm_id *cm_id, ctx = ucma_alloc_ctx(listen_ctx->file); if (!ctx) goto err_backlog; - ctx->cm_id = cm_id; + ucma_set_ctx_cm_id(ctx, cm_id); uevent = ucma_create_uevent(listen_ctx, event); if (!uevent) @@ -321,8 +320,7 @@ static int ucma_connect_event_handler(struct rdma_cm_id *cm_id, return 0; err_alloc: - xa_erase(&ctx_table, ctx->id); - kfree(ctx); + ucma_destroy_private_ctx(ctx); err_backlog: atomic_inc(&listen_ctx->backlog); /* Returning error causes the new ID to be destroyed */ @@ -356,8 +354,12 @@ static int ucma_event_handler(struct rdma_cm_id *cm_id, wake_up_interruptible(&ctx->file->poll_wait); } - if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL && !ctx->destroying) - queue_work(system_unbound_wq, &ctx->close_work); + if (event->event == RDMA_CM_EVENT_DEVICE_REMOVAL) { + xa_lock(&ctx_table); + if (xa_load(&ctx_table, ctx->id) == ctx) + queue_work(system_unbound_wq, &ctx->close_work); + xa_unlock(&ctx_table); + } return 0; } @@ -461,13 +463,12 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, ret = PTR_ERR(cm_id); goto err1; } - ctx->cm_id = cm_id; + ucma_set_ctx_cm_id(ctx, cm_id); resp.id = ctx->id; if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) { - xa_erase(&ctx_table, ctx->id); - __destroy_id(ctx); + ucma_destroy_private_ctx(ctx); return -EFAULT; } @@ -477,8 +478,7 @@ static ssize_t ucma_create_id(struct ucma_file *file, const char __user *inbuf, return 0; err1: - xa_erase(&ctx_table, ctx->id); - kfree(ctx); + ucma_destroy_private_ctx(ctx); return ret; } @@ -516,68 +516,73 @@ static void ucma_cleanup_mc_events(struct ucma_multicast *mc) rdma_unlock_handler(mc->ctx->cm_id); } -/* - * ucma_free_ctx is called after the underlying rdma CM-ID is destroyed. At - * this point, no new events will be reported from the hardware. However, we - * still need to cleanup the UCMA context for this ID. Specifically, there - * might be events that have not yet been consumed by the user space software. - * mutex. After that we release them as needed. - */ -static int ucma_free_ctx(struct ucma_context *ctx) +static int ucma_cleanup_ctx_events(struct ucma_context *ctx) { int events_reported; struct ucma_event *uevent, *tmp; LIST_HEAD(list); - ucma_cleanup_multicast(ctx); - - /* Cleanup events not yet reported to the user. */ + /* Cleanup events not yet reported to the user.*/ mutex_lock(&ctx->file->mut); list_for_each_entry_safe(uevent, tmp, &ctx->file->event_list, list) { - if (uevent->ctx == ctx || uevent->conn_req_ctx == ctx) + if (uevent->ctx != ctx) + continue; + + if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST && + xa_cmpxchg(&ctx_table, uevent->conn_req_ctx->id, + uevent->conn_req_ctx, XA_ZERO_ENTRY, + GFP_KERNEL) == uevent->conn_req_ctx) { list_move_tail(&uevent->list, &list); + continue; + } + list_del(&uevent->list); + kfree(uevent); } list_del(&ctx->list); events_reported = ctx->events_reported; mutex_unlock(&ctx->file->mut); /* - * If this was a listening ID then any connections spawned from it - * that have not been delivered to userspace are cleaned up too. - * Must be done outside any locks. + * If this was a listening ID then any connections spawned from it that + * have not been delivered to userspace are cleaned up too. Must be done + * outside any locks. */ list_for_each_entry_safe(uevent, tmp, &list, list) { - list_del(&uevent->list); - if (uevent->resp.event == RDMA_CM_EVENT_CONNECT_REQUEST && - uevent->conn_req_ctx != ctx) - __destroy_id(uevent->conn_req_ctx); + ucma_destroy_private_ctx(uevent->conn_req_ctx); kfree(uevent); } - - mutex_destroy(&ctx->mutex); - kfree(ctx); return events_reported; } -static int __destroy_id(struct ucma_context *ctx) +/* + * When this is called the xarray must have a XA_ZERO_ENTRY in the ctx->id (ie + * the ctx is not public to the user). This either because: + * - ucma_finish_ctx() hasn't been called + * - xa_cmpxchg() succeed to remove the entry (only one thread can succeed) + */ +static int ucma_destroy_private_ctx(struct ucma_context *ctx) { - /* - * If the refcount is already 0 then ucma_close_id() has already - * destroyed the cm_id, otherwise holding the refcount keeps cm_id - * valid. Prevent queue_work() from being called. - */ - if (refcount_inc_not_zero(&ctx->ref)) { - rdma_lock_handler(ctx->cm_id); - ctx->destroying = 1; - rdma_unlock_handler(ctx->cm_id); - ucma_put_ctx(ctx); - } + int events_reported; + /* + * Destroy the underlying cm_id. New work queuing is prevented now by + * the removal from the xarray. Once the work is cancled ref will either + * be 0 because the work ran to completion and consumed the ref from the + * xarray, or it will be positive because we still have the ref from the + * xarray. This can also be 0 in cases where cm_id was never set + */ cancel_work_sync(&ctx->close_work); - /* At this point it's guaranteed that there is no inflight closing task */ - if (ctx->cm_id) + if (refcount_read(&ctx->ref)) ucma_close_id(&ctx->close_work); - return ucma_free_ctx(ctx); + + events_reported = ucma_cleanup_ctx_events(ctx); + ucma_cleanup_multicast(ctx); + + WARN_ON(xa_cmpxchg(&ctx_table, ctx->id, XA_ZERO_ENTRY, NULL, + GFP_KERNEL) != NULL); + mutex_destroy(&ctx->mutex); + kfree(ctx); + return events_reported; } static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, @@ -596,14 +601,17 @@ static ssize_t ucma_destroy_id(struct ucma_file *file, const char __user *inbuf, xa_lock(&ctx_table); ctx = _ucma_find_context(cmd.id, file); - if (!IS_ERR(ctx)) - __xa_erase(&ctx_table, ctx->id); + if (!IS_ERR(ctx)) { + if (__xa_cmpxchg(&ctx_table, ctx->id, ctx, XA_ZERO_ENTRY, + GFP_KERNEL) != ctx) + ctx = ERR_PTR(-ENOENT); + } xa_unlock(&ctx_table); if (IS_ERR(ctx)) return PTR_ERR(ctx); - resp.events_reported = __destroy_id(ctx); + resp.events_reported = ucma_destroy_private_ctx(ctx); if (copy_to_user(u64_to_user_ptr(cmd.response), &resp, sizeof(resp))) ret = -EFAULT; @@ -1777,15 +1785,16 @@ static int ucma_close(struct inode *inode, struct file *filp) * prevented by this being a FD release function. The list_add_tail() in * ucma_connect_event_handler() can run concurrently, however it only * adds to the list *after* a listening ID. By only reading the first of - * the list, and relying on __destroy_id() to block + * the list, and relying on ucma_destroy_private_ctx() to block * ucma_connect_event_handler(), no additional locking is needed. */ while (!list_empty(&file->ctx_list)) { struct ucma_context *ctx = list_first_entry( &file->ctx_list, struct ucma_context, list); - xa_erase(&ctx_table, ctx->id); - __destroy_id(ctx); + WARN_ON(xa_cmpxchg(&ctx_table, ctx->id, ctx, XA_ZERO_ENTRY, + GFP_KERNEL) != ctx); + ucma_destroy_private_ctx(ctx); } kfree(file); return 0; From fcc42338375a1e67b8568dbb558f8b784d0f3b01 Mon Sep 17 00:00:00 2001 From: Akilesh Kailash Date: Mon, 28 Dec 2020 07:14:07 +0000 Subject: [PATCH 028/241] dm snapshot: flush merged data before committing metadata If the origin device has a volatile write-back cache and the following events occur: 1: After finishing merge operation of one set of exceptions, merge_callback() is invoked. 2: Update the metadata in COW device tracking the merge completion. This update to COW device is flushed cleanly. 3: System crashes and the origin device's cache where the recent merge was completed has not been flushed. During the next cycle when we read the metadata from the COW device, we will skip reading those metadata whose merge was completed in step (1). This will lead to data loss/corruption. To address this, flush the origin device post merge IO before updating the metadata. Cc: stable@vger.kernel.org Signed-off-by: Akilesh Kailash Signed-off-by: Mike Snitzer --- drivers/md/dm-snap.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c index 4668b2cd98f4..11890db71f3f 100644 --- a/drivers/md/dm-snap.c +++ b/drivers/md/dm-snap.c @@ -141,6 +141,11 @@ struct dm_snapshot { * for them to be committed. */ struct bio_list bios_queued_during_merge; + + /* + * Flush data after merge. + */ + struct bio flush_bio; }; /* @@ -1121,6 +1126,17 @@ shut: static void error_bios(struct bio *bio); +static int flush_data(struct dm_snapshot *s) +{ + struct bio *flush_bio = &s->flush_bio; + + bio_reset(flush_bio); + bio_set_dev(flush_bio, s->origin->bdev); + flush_bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + + return submit_bio_wait(flush_bio); +} + static void merge_callback(int read_err, unsigned long write_err, void *context) { struct dm_snapshot *s = context; @@ -1134,6 +1150,11 @@ static void merge_callback(int read_err, unsigned long write_err, void *context) goto shut; } + if (flush_data(s) < 0) { + DMERR("Flush after merge failed: shutting down merge"); + goto shut; + } + if (s->store->type->commit_merge(s->store, s->num_merging_chunks) < 0) { DMERR("Write error in exception store: shutting down merge"); @@ -1318,6 +1339,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv) s->first_merging_chunk = 0; s->num_merging_chunks = 0; bio_list_init(&s->bios_queued_during_merge); + bio_init(&s->flush_bio, NULL, 0); /* Allocate hash table for COW data */ if (init_hash_tables(s)) { @@ -1504,6 +1526,8 @@ static void snapshot_dtr(struct dm_target *ti) dm_exception_store_destroy(s->store); + bio_uninit(&s->flush_bio); + dm_put_device(ti, s->cow); dm_put_device(ti, s->origin); From 3c638cdb8ecc0442552156e0fed8708dd2c7f35b Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 16 Dec 2020 12:07:53 +0200 Subject: [PATCH 029/241] RDMA/restrack: Don't treat as an error allocation ID wrapping xa_alloc_cyclic() call returns positive number if ID allocation succeeded but wrapped. It is not an error, so normalize the "ret" variable to zero as marker of not-an-error. drivers/infiniband/core/restrack.c:261 rdma_restrack_add() warn: 'ret' can be either negative or positive Fixes: fd47c2f99f04 ("RDMA/restrack: Convert internal DB from hash to XArray") Link: https://lore.kernel.org/r/20201216100753.1127638-1-leon@kernel.org Reported-by: Dan Carpenter Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/restrack.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/infiniband/core/restrack.c b/drivers/infiniband/core/restrack.c index e0a41c867002..ff1551b3cf61 100644 --- a/drivers/infiniband/core/restrack.c +++ b/drivers/infiniband/core/restrack.c @@ -254,6 +254,7 @@ void rdma_restrack_add(struct rdma_restrack_entry *res) } else { ret = xa_alloc_cyclic(&rt->xa, &res->id, res, xa_limit_32b, &rt->next_id, GFP_KERNEL); + ret = (ret < 0) ? ret : 0; } out: From a306aba9c8d869b1fdfc8ad9237f1ed718ea55e6 Mon Sep 17 00:00:00 2001 From: Dinghao Liu Date: Sat, 26 Dec 2020 15:42:48 +0800 Subject: [PATCH 030/241] RDMA/usnic: Fix memleak in find_free_vf_and_create_qp_grp If usnic_ib_qp_grp_create() fails at the first call, dev_list will not be freed on error, which leads to memleak. Fixes: e3cf00d0a87f ("IB/usnic: Add Cisco VIC low-level hardware driver") Link: https://lore.kernel.org/r/20201226074248.2893-1-dinghao.liu@zju.edu.cn Signed-off-by: Dinghao Liu Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/usnic/usnic_ib_verbs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index 38a37770c016..3705c6b8b223 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -214,6 +214,7 @@ find_free_vf_and_create_qp_grp(struct usnic_ib_dev *us_ibdev, } usnic_uiom_free_dev_list(dev_list); + dev_list = NULL; } /* Try to find resources on an unused vf */ @@ -239,6 +240,8 @@ find_free_vf_and_create_qp_grp(struct usnic_ib_dev *us_ibdev, qp_grp_check: if (IS_ERR_OR_NULL(qp_grp)) { usnic_err("Failed to allocate qp_grp\n"); + if (usnic_ib_share_vf) + usnic_uiom_free_dev_list(dev_list); return ERR_PTR(qp_grp ? PTR_ERR(qp_grp) : -ENOMEM); } return qp_grp; From f2bc3af6353cb2a33dfa9d270d999d839eef54cb Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 29 Dec 2020 18:46:53 -0800 Subject: [PATCH 031/241] RDMA/ocrdma: Fix use after free in ocrdma_dealloc_ucontext_pd() In ocrdma_dealloc_ucontext_pd() uctx->cntxt_pd is assigned to the variable pd and then after uctx->cntxt_pd is freed, the variable pd is passed to function _ocrdma_dealloc_pd() which dereferences pd directly or through its call to ocrdma_mbx_dealloc_pd(). Reorder the free using the variable pd. Cc: stable@vger.kernel.org Fixes: 21a428a019c9 ("RDMA: Handle PD allocations by IB/core") Link: https://lore.kernel.org/r/20201230024653.1516495-1-trix@redhat.com Signed-off-by: Tom Rix Reviewed-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/ocrdma/ocrdma_verbs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index bc98bd950d99..3acb5c10b155 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -434,9 +434,9 @@ static void ocrdma_dealloc_ucontext_pd(struct ocrdma_ucontext *uctx) pr_err("%s(%d) Freeing in use pdid=0x%x.\n", __func__, dev->id, pd->id); } - kfree(uctx->cntxt_pd); uctx->cntxt_pd = NULL; _ocrdma_dealloc_pd(dev, pd); + kfree(pd); } static struct ocrdma_pd *ocrdma_get_ucontext_pd(struct ocrdma_ucontext *uctx) From cf7b2ae4d70432fa94ebba3fbaab825481ae7189 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Mon, 21 Dec 2020 23:52:00 +0100 Subject: [PATCH 032/241] riscv: return -ENOSYS for syscall -1 Properly return -ENOSYS for syscall -1 instead of leaving the return value uninitialized. This fixes the strace teststuite. Fixes: 5340627e3fe0 ("riscv: add support for SECCOMP and SECCOMP_FILTER") Cc: stable@vger.kernel.org Signed-off-by: Andreas Schwab Reviewed-by: Tycho Andersen Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/entry.S | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 524d918f3601..d07763001eb0 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -186,14 +186,7 @@ check_syscall_nr: * Syscall number held in a7. * If syscall number is above allowed value, redirect to ni_syscall. */ - bge a7, t0, 1f - /* - * Check if syscall is rejected by tracer, i.e., a7 == -1. - * If yes, we pretend it was executed. - */ - li t1, -1 - beq a7, t1, ret_from_syscall_rejected - blt a7, t1, 1f + bgeu a7, t0, 1f /* Call syscall */ la s0, sys_call_table slli t0, a7, RISCV_LGPTR From 11f4c2e940e2f317c9d8fb5a79702f2a4a02ff98 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sun, 13 Dec 2020 22:50:34 +0900 Subject: [PATCH 033/241] riscv: Fix kernel time_init() If of_clk_init() is not called in time_init(), clock providers defined in the system device tree are not initialized, resulting in failures for other devices to initialize due to missing clocks. Similarly to other architectures and to the default kernel time_init() implementation, call of_clk_init() before executing timer_probe() in time_init(). Signed-off-by: Damien Le Moal Acked-by: Stephen Boyd Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/time.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/riscv/kernel/time.c b/arch/riscv/kernel/time.c index 4d3a1048ad8b..8a5cf99c0776 100644 --- a/arch/riscv/kernel/time.c +++ b/arch/riscv/kernel/time.c @@ -4,6 +4,7 @@ * Copyright (C) 2017 SiFive */ +#include #include #include #include @@ -24,6 +25,8 @@ void __init time_init(void) riscv_timebase = prop; lpj_fine = riscv_timebase / HZ; + + of_clk_init(NULL); timer_probe(); } From 1f1496a923b6ba16679074fe77100e1b53cdb880 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sun, 13 Dec 2020 22:50:35 +0900 Subject: [PATCH 034/241] riscv: Fix sifive serial driver Setup the port uartclk in sifive_serial_probe() so that the base baud rate is correctly printed during device probe instead of always showing "0". I.e. the probe message is changed from 38000000.serial: ttySIF0 at MMIO 0x38000000 (irq = 1, base_baud = 0) is a SiFive UART v0 to the correct: 38000000.serial: ttySIF0 at MMIO 0x38000000 (irq = 1, base_baud = 115200) is a SiFive UART v0 Signed-off-by: Damien Le Moal Reviewed-by: Palmer Dabbelt Acked-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- drivers/tty/serial/sifive.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/tty/serial/sifive.c b/drivers/tty/serial/sifive.c index 1066eebe3b28..328d5a78792f 100644 --- a/drivers/tty/serial/sifive.c +++ b/drivers/tty/serial/sifive.c @@ -1000,6 +1000,7 @@ static int sifive_serial_probe(struct platform_device *pdev) /* Set up clock divider */ ssp->clkin_rate = clk_get_rate(ssp->clk); ssp->baud_rate = SIFIVE_DEFAULT_BAUD_RATE; + ssp->port.uartclk = ssp->baud_rate * 16; __ssp_update_div(ssp); platform_set_drvdata(pdev, ssp); From 643437b996bac9267785e0bd528332e2d5811067 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sun, 13 Dec 2020 22:50:36 +0900 Subject: [PATCH 035/241] riscv: Enable interrupts during syscalls with M-Mode When running is M-Mode (no MMU config), MPIE does not get set. This results in all syscalls being executed with interrupts disabled as handle_exception never sets SR_IE as it always sees SR_PIE being cleared. Fix this by always force enabling interrupts in handle_syscall when CONFIG_RISCV_M_MODE is enabled. Signed-off-by: Damien Le Moal Reviewed-by: Palmer Dabbelt Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/entry.S | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index d07763001eb0..48de316c68c1 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -155,6 +155,15 @@ skip_context_tracking: tail do_trap_unknown handle_syscall: +#ifdef CONFIG_RISCV_M_MODE + /* + * When running is M-Mode (no MMU config), MPIE does not get set. + * As a result, we need to force enable interrupts here because + * handle_exception did not do set SR_IE as it always sees SR_PIE + * being cleared. + */ + csrs CSR_STATUS, SR_IE +#endif #if defined(CONFIG_TRACE_IRQFLAGS) || defined(CONFIG_CONTEXT_TRACKING) /* Recover a0 - a7 for system calls */ REG_L a0, PT_A0(sp) From 3c02e04fd4f57130e4fa75fab6f528f7a52db9b5 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Thu, 31 Dec 2020 00:33:18 +0300 Subject: [PATCH 036/241] crypto: xor - Fix divide error in do_xor_speed() crypto: Fix divide error in do_xor_speed() From: Kirill Tkhai Latest (but not only latest) linux-next panics with divide error on my QEMU setup. The patch at the bottom of this message fixes the problem. xor: measuring software checksum speed divide error: 0000 [#1] PREEMPT SMP KASAN PREEMPT SMP KASAN CPU: 3 PID: 1 Comm: swapper/0 Not tainted 5.10.0-next-20201223+ #2177 RIP: 0010:do_xor_speed+0xbb/0xf3 Code: 41 ff cc 75 b5 bf 01 00 00 00 e8 3d 23 8b fe 65 8b 05 f6 49 83 7d 85 c0 75 05 e8 84 70 81 fe b8 00 00 50 c3 31 d2 48 8d 7b 10 f5 41 89 c4 e8 58 07 a2 fe 44 89 63 10 48 8d 7b 08 e8 cb 07 a2 RSP: 0000:ffff888100137dc8 EFLAGS: 00010246 RAX: 00000000c3500000 RBX: ffffffff823f0160 RCX: 0000000000000000 RDX: 0000000000000000 RSI: 0000000000000808 RDI: ffffffff823f0170 RBP: 0000000000000000 R08: ffffffff8109c50f R09: ffffffff824bb6f7 R10: fffffbfff04976de R11: 0000000000000001 R12: 0000000000000000 R13: ffff888101997000 R14: ffff888101994000 R15: ffffffff823f0178 FS: 0000000000000000(0000) GS:ffff8881f7780000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000000220e000 CR4: 00000000000006a0 Call Trace: calibrate_xor_blocks+0x13c/0x1c4 ? do_xor_speed+0xf3/0xf3 do_one_initcall+0xc1/0x1b7 ? start_kernel+0x373/0x373 ? unpoison_range+0x3a/0x60 kernel_init_freeable+0x1dd/0x238 ? rest_init+0xc6/0xc6 kernel_init+0x8/0x10a ret_from_fork+0x1f/0x30 ---[ end trace 5bd3c1d0b77772da ]--- Fixes: c055e3eae0f1 ("crypto: xor - use ktime for template benchmarking") Cc: Signed-off-by: Kirill Tkhai Acked-by: Ard Biesheuvel Signed-off-by: Herbert Xu --- crypto/xor.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/crypto/xor.c b/crypto/xor.c index eacbf4f93990..8f899f898ec9 100644 --- a/crypto/xor.c +++ b/crypto/xor.c @@ -107,6 +107,8 @@ do_xor_speed(struct xor_block_template *tmpl, void *b1, void *b2) preempt_enable(); // bytes/ns == GB/s, multiply by 1000 to get MB/s [not MiB/s] + if (!min) + min = 1; speed = (1000 * REPS * BENCH_SIZE) / (unsigned int)ktime_to_ns(min); tmpl->speed = speed; From 382811940303f7cd01d0f3dcdf432dfd89c5a98e Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Sun, 3 Jan 2021 15:03:04 +0100 Subject: [PATCH 037/241] crypto: omap-sham - Fix link error without crypto-engine The driver was converted to use the crypto engine helper but is missing the corresponding Kconfig statement to ensure it is available: arm-linux-gnueabi-ld: drivers/crypto/omap-sham.o: in function `omap_sham_probe': omap-sham.c:(.text+0x374): undefined reference to `crypto_engine_alloc_init' arm-linux-gnueabi-ld: omap-sham.c:(.text+0x384): undefined reference to `crypto_engine_start' arm-linux-gnueabi-ld: omap-sham.c:(.text+0x510): undefined reference to `crypto_engine_exit' arm-linux-gnueabi-ld: drivers/crypto/omap-sham.o: in function `omap_sham_finish_req': omap-sham.c:(.text+0x98c): undefined reference to `crypto_finalize_hash_request' arm-linux-gnueabi-ld: omap-sham.c:(.text+0x9a0): undefined reference to `crypto_transfer_hash_request_to_engine' arm-linux-gnueabi-ld: drivers/crypto/omap-sham.o: in function `omap_sham_update': omap-sham.c:(.text+0xf24): undefined reference to `crypto_transfer_hash_request_to_engine' arm-linux-gnueabi-ld: drivers/crypto/omap-sham.o: in function `omap_sham_final': omap-sham.c:(.text+0x1020): undefined reference to `crypto_transfer_hash_request_to_engine' Fixes: 133c3d434d91 ("crypto: omap-sham - convert to use crypto engine") Signed-off-by: Arnd Bergmann Signed-off-by: Herbert Xu --- drivers/crypto/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index bbd51703e738..e535f28a8028 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -366,6 +366,7 @@ if CRYPTO_DEV_OMAP config CRYPTO_DEV_OMAP_SHAM tristate "Support for OMAP MD5/SHA1/SHA2 hw accelerator" depends on ARCH_OMAP2PLUS + select CRYPTO_ENGINE select CRYPTO_SHA1 select CRYPTO_MD5 select CRYPTO_SHA256 From 35d0b389f3b23439ad15b610d6e43fc72fc75779 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 5 Jan 2021 11:32:43 -0700 Subject: [PATCH 038/241] task_work: unconditionally run task_work from get_signal() Song reported a boot regression in a kvm image with 5.11-rc, and bisected it down to the below patch. Debugging this issue, turns out that the boot stalled when a task is waiting on a pipe being released. As we no longer run task_work from get_signal() unless it's queued with TWA_SIGNAL, the task goes idle without running the task_work. This prevents ->release() from being called on the pipe, which another boot task is waiting on. For now, re-instate the unconditional task_work run from get_signal(). For 5.12, we'll collapse TWA_RESUME and TWA_SIGNAL, as it no longer makes sense to have a distinction between the two. This will turn task_work notification into a simple boolean, whether to notify or not. Fixes: 98b89b649fce ("signal: kill JOBCTL_TASK_WORK") Reported-by: Song Liu Tested-by: John Stultz Tested-by: Douglas Anderson Tested-by: Sedat Dilek # LLVM/Clang version 11.0.1 Signed-off-by: Jens Axboe --- kernel/signal.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/signal.c b/kernel/signal.c index 5736c55aaa1a..6b9c431da08f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2550,6 +2550,9 @@ bool get_signal(struct ksignal *ksig) struct signal_struct *signal = current->signal; int signr; + if (unlikely(current->task_works)) + task_work_run(); + /* * For non-generic architectures, check for TIF_NOTIFY_SIGNAL so * that the arch handlers don't all have to do it. If we get here From 70b6ff35d62050d1573876cc0e1e078acd3e6008 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 6 Jan 2021 23:47:41 +0100 Subject: [PATCH 039/241] cfg80211/mac80211: fix kernel-doc for SAR APIs A stray @ caused the kernel-doc parser to not understand this, fix that. Also add some missing kernel-doc. Reported-by: Stephen Rothwell Fixes: 6bdb68cef7bf ("nl80211: add common API to configure SAR power limitations") Fixes: c534e093d865 ("mac80211: add ieee80211_set_sar_specs") Signed-off-by: Johannes Berg Tested-by: Stephen Rothwell # build only Link: https://lore.kernel.org/r/20210106234740.96827c18f9bd.I8b9f0a9cbfe186931ef9640046f414371f216914@changeid Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 5 ++++- include/net/mac80211.h | 1 + 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 9a4bbccddc7f..1b3954afcda4 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1756,7 +1756,7 @@ struct cfg80211_sar_specs { /** - * @struct cfg80211_sar_chan_ranges - sar frequency ranges + * struct cfg80211_sar_chan_ranges - sar frequency ranges * @start_freq: start range edge frequency * @end_freq: end range edge frequency */ @@ -3972,6 +3972,8 @@ struct mgmt_frame_regs { * This callback may sleep. * @reset_tid_config: Reset TID specific configuration for the peer, for the * given TIDs. This callback may sleep. + * + * @set_sar_specs: Update the SAR (TX power) settings. */ struct cfg80211_ops { int (*suspend)(struct wiphy *wiphy, struct cfg80211_wowlan *wow); @@ -4929,6 +4931,7 @@ struct wiphy_iftype_akm_suites { * @max_data_retry_count: maximum supported per TID retry count for * configuration through the %NL80211_TID_CONFIG_ATTR_RETRY_SHORT and * %NL80211_TID_CONFIG_ATTR_RETRY_LONG attributes + * @sar_capa: SAR control capabilities */ struct wiphy { /* assign these fields before you register the wiphy */ diff --git a/include/net/mac80211.h b/include/net/mac80211.h index d315740581f1..2bdbf62f4ecd 100644 --- a/include/net/mac80211.h +++ b/include/net/mac80211.h @@ -3880,6 +3880,7 @@ enum ieee80211_reconfig_type { * This callback may sleep. * @sta_set_4addr: Called to notify the driver when a station starts/stops using * 4-address mode + * @set_sar_specs: Update the SAR (TX power) settings. */ struct ieee80211_ops { void (*tx)(struct ieee80211_hw *hw, From 51d62f2f2c501a93d9a6a46f43731f984e227764 Mon Sep 17 00:00:00 2001 From: Ilan Peer Date: Tue, 5 Jan 2021 16:56:57 +0200 Subject: [PATCH 040/241] cfg80211: Save the regulatory domain with a lock Saving the regulatory domain while setting custom regulatory domain was done while accessing a RCU protected pointer but without any protection. Fix this by using RTNL while accessing the pointer. Signed-off-by: Ilan Peer Reported-by: syzbot+27771d4abcd9b7a1f5d3@syzkaller.appspotmail.com Reported-by: syzbot+db4035751c56c0079282@syzkaller.appspotmail.com Reported-by: Hans de Goede Fixes: beee24695157 ("cfg80211: Save the regulatory domain when setting custom regulatory") Signed-off-by: Luca Coelho Link: https://lore.kernel.org/r/iwlwifi.20210105165657.613e9a876829.Ia38d27dbebea28bf9c56d70691d243186ede70e7@changeid Signed-off-by: Johannes Berg --- net/wireless/reg.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/net/wireless/reg.c b/net/wireless/reg.c index bb72447ad960..8114bba8556c 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -5,7 +5,7 @@ * Copyright 2008-2011 Luis R. Rodriguez * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright 2017 Intel Deutschland GmbH - * Copyright (C) 2018 - 2019 Intel Corporation + * Copyright (C) 2018 - 2021 Intel Corporation * * Permission to use, copy, modify, and/or distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -139,6 +139,11 @@ static const struct ieee80211_regdomain *get_cfg80211_regdom(void) return rcu_dereference_rtnl(cfg80211_regdomain); } +/* + * Returns the regulatory domain associated with the wiphy. + * + * Requires either RTNL or RCU protection + */ const struct ieee80211_regdomain *get_wiphy_regdom(struct wiphy *wiphy) { return rcu_dereference_rtnl(wiphy->regd); @@ -2571,9 +2576,13 @@ void wiphy_apply_custom_regulatory(struct wiphy *wiphy, if (IS_ERR(new_regd)) return; + rtnl_lock(); + tmp = get_wiphy_regdom(wiphy); rcu_assign_pointer(wiphy->regd, new_regd); rcu_free_regdom(tmp); + + rtnl_unlock(); } EXPORT_SYMBOL(wiphy_apply_custom_regulatory); From 0378c625afe80eb3f212adae42cc33c9f6f31abf Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Wed, 6 Jan 2021 18:19:05 -0500 Subject: [PATCH 041/241] dm: eliminate potential source of excessive kernel log noise There wasn't ever a real need to log an error in the kernel log for ioctls issued with insufficient permissions. Simply return an error and if an admin/user is sufficiently motivated they can enable DM's dynamic debugging to see an explanation for why the ioctls were disallowed. Reported-by: Nir Soffer Fixes: e980f62353c6 ("dm: don't allow ioctls to targets that don't map to whole devices") Signed-off-by: Mike Snitzer --- drivers/md/dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b3c3c8b4cb42..7bac564f3faa 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -562,7 +562,7 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, * subset of the parent bdev; require extra privileges. */ if (!capable(CAP_SYS_RAWIO)) { - DMWARN_LIMIT( + DMDEBUG_LIMIT( "%s: sending ioctl %x to DM device without required privilege.", current->comm, cmd); r = -ENOIOCTLCMD; From 9b5948267adc9e689da609eb61cf7ed49cae5fa8 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 8 Jan 2021 11:15:56 -0500 Subject: [PATCH 042/241] dm integrity: fix flush with external metadata device With external metadata device, flush requests are not passed down to the data device. Fix this by submitting the flush request in dm_integrity_flush_buffers. In order to not degrade performance, we overlap the data device flush with the metadata device flush. Reported-by: Lukas Straub Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Mike Snitzer --- drivers/md/dm-bufio.c | 6 ++++ drivers/md/dm-integrity.c | 60 ++++++++++++++++++++++++++++++++------- include/linux/dm-bufio.h | 1 + 3 files changed, 56 insertions(+), 11 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index 9c1a86bde658..fce4cbf9529d 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -1534,6 +1534,12 @@ sector_t dm_bufio_get_device_size(struct dm_bufio_client *c) } EXPORT_SYMBOL_GPL(dm_bufio_get_device_size); +struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c) +{ + return c->dm_io; +} +EXPORT_SYMBOL_GPL(dm_bufio_get_dm_io_client); + sector_t dm_bufio_get_block_number(struct dm_buffer *b) { return b->block; diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 5a7a1b90e671..11c7c538f7a9 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -1379,12 +1379,52 @@ thorough_test: #undef MAY_BE_HASH } -static void dm_integrity_flush_buffers(struct dm_integrity_c *ic) +struct flush_request { + struct dm_io_request io_req; + struct dm_io_region io_reg; + struct dm_integrity_c *ic; + struct completion comp; +}; + +static void flush_notify(unsigned long error, void *fr_) +{ + struct flush_request *fr = fr_; + if (unlikely(error != 0)) + dm_integrity_io_error(fr->ic, "flusing disk cache", -EIO); + complete(&fr->comp); +} + +static void dm_integrity_flush_buffers(struct dm_integrity_c *ic, bool flush_data) { int r; + + struct flush_request fr; + + if (!ic->meta_dev) + flush_data = false; + if (flush_data) { + fr.io_req.bi_op = REQ_OP_WRITE, + fr.io_req.bi_op_flags = REQ_PREFLUSH | REQ_SYNC, + fr.io_req.mem.type = DM_IO_KMEM, + fr.io_req.mem.ptr.addr = NULL, + fr.io_req.notify.fn = flush_notify, + fr.io_req.notify.context = &fr; + fr.io_req.client = dm_bufio_get_dm_io_client(ic->bufio), + fr.io_reg.bdev = ic->dev->bdev, + fr.io_reg.sector = 0, + fr.io_reg.count = 0, + fr.ic = ic; + init_completion(&fr.comp); + r = dm_io(&fr.io_req, 1, &fr.io_reg, NULL); + BUG_ON(r); + } + r = dm_bufio_write_dirty_buffers(ic->bufio); if (unlikely(r)) dm_integrity_io_error(ic, "writing tags", r); + + if (flush_data) + wait_for_completion(&fr.comp); } static void sleep_on_endio_wait(struct dm_integrity_c *ic) @@ -2110,7 +2150,7 @@ offload_to_thread: if (unlikely(dio->op == REQ_OP_DISCARD) && likely(ic->mode != 'D')) { integrity_metadata(&dio->work); - dm_integrity_flush_buffers(ic); + dm_integrity_flush_buffers(ic, false); dio->in_flight = (atomic_t)ATOMIC_INIT(1); dio->completion = NULL; @@ -2195,7 +2235,7 @@ static void integrity_commit(struct work_struct *w) flushes = bio_list_get(&ic->flush_bio_list); if (unlikely(ic->mode != 'J')) { spin_unlock_irq(&ic->endio_wait.lock); - dm_integrity_flush_buffers(ic); + dm_integrity_flush_buffers(ic, true); goto release_flush_bios; } @@ -2409,7 +2449,7 @@ skip_io: complete_journal_op(&comp); wait_for_completion_io(&comp.comp); - dm_integrity_flush_buffers(ic); + dm_integrity_flush_buffers(ic, true); } static void integrity_writer(struct work_struct *w) @@ -2451,7 +2491,7 @@ static void recalc_write_super(struct dm_integrity_c *ic) { int r; - dm_integrity_flush_buffers(ic); + dm_integrity_flush_buffers(ic, false); if (dm_integrity_failed(ic)) return; @@ -2654,7 +2694,7 @@ static void bitmap_flush_work(struct work_struct *work) unsigned long limit; struct bio *bio; - dm_integrity_flush_buffers(ic); + dm_integrity_flush_buffers(ic, false); range.logical_sector = 0; range.n_sectors = ic->provided_data_sectors; @@ -2663,9 +2703,7 @@ static void bitmap_flush_work(struct work_struct *work) add_new_range_and_wait(ic, &range); spin_unlock_irq(&ic->endio_wait.lock); - dm_integrity_flush_buffers(ic); - if (ic->meta_dev) - blkdev_issue_flush(ic->dev->bdev, GFP_NOIO); + dm_integrity_flush_buffers(ic, true); limit = ic->provided_data_sectors; if (ic->sb->flags & cpu_to_le32(SB_FLAG_RECALCULATING)) { @@ -2934,11 +2972,11 @@ static void dm_integrity_postsuspend(struct dm_target *ti) if (ic->meta_dev) queue_work(ic->writer_wq, &ic->writer_work); drain_workqueue(ic->writer_wq); - dm_integrity_flush_buffers(ic); + dm_integrity_flush_buffers(ic, true); } if (ic->mode == 'B') { - dm_integrity_flush_buffers(ic); + dm_integrity_flush_buffers(ic, true); #if 1 /* set to 0 to test bitmap replay code */ init_journal(ic, 0, ic->journal_sections, 0); diff --git a/include/linux/dm-bufio.h b/include/linux/dm-bufio.h index 29d255fdd5d6..90bd558a17f5 100644 --- a/include/linux/dm-bufio.h +++ b/include/linux/dm-bufio.h @@ -150,6 +150,7 @@ void dm_bufio_set_minimum_buffers(struct dm_bufio_client *c, unsigned n); unsigned dm_bufio_get_block_size(struct dm_bufio_client *c); sector_t dm_bufio_get_device_size(struct dm_bufio_client *c); +struct dm_io_client *dm_bufio_get_dm_io_client(struct dm_bufio_client *c); sector_t dm_bufio_get_block_number(struct dm_buffer *b); void *dm_bufio_get_block_data(struct dm_buffer *b); void *dm_bufio_get_aux_data(struct dm_buffer *b); From 0ea02c73775277001c651ad4a0e83781a9acf406 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 11 Nov 2020 19:52:16 +0800 Subject: [PATCH 043/241] riscv: Drop a duplicated PAGE_KERNEL_EXEC commit b91540d52a08 ("RISC-V: Add EFI runtime services") add a duplicated PAGE_KERNEL_EXEC, kill it. Signed-off-by: Kefeng Wang Reviewed-by: Pekka Enberg Reviewed-by: Atish Patra Fixes: b91540d52a08 ("RISC-V: Add EFI runtime services") Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/pgtable.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 41a72861987c..251e1db088fa 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -99,7 +99,6 @@ | _PAGE_DIRTY) #define PAGE_KERNEL __pgprot(_PAGE_KERNEL) -#define PAGE_KERNEL_EXEC __pgprot(_PAGE_KERNEL | _PAGE_EXEC) #define PAGE_KERNEL_READ __pgprot(_PAGE_KERNEL & ~_PAGE_WRITE) #define PAGE_KERNEL_EXEC __pgprot(_PAGE_KERNEL | _PAGE_EXEC) #define PAGE_KERNEL_READ_EXEC __pgprot((_PAGE_KERNEL & ~_PAGE_WRITE) \ From a0adc8eabb402cfb9f32d15edd9f65f65e35cdce Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 7 Jan 2021 20:26:16 +0000 Subject: [PATCH 044/241] dma-buf: cma_heap: Fix memory leak in CMA heap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bing Song noticed the CMA heap was leaking memory due to a flub I made in commit a5d2d29e24be ("dma-buf: heaps: Move heap-helper logic into the cma_heap implementation"), and provided this fix which ensures the pagelist is also freed on release. Cc: Bing Song Cc: Sumit Semwal Cc: Liam Mark Cc: Laura Abbott Cc: Brian Starkey Cc: Hridya Valsaraju Cc: Suren Baghdasaryan Cc: Sandeep Patil Cc: Daniel Mentz Cc: Chris Goldsworthy Cc: Ørjan Eide Cc: Robin Murphy Cc: Ezequiel Garcia Cc: Simon Ser Cc: James Jones Cc: linux-media@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Reported-by: Bing Song Fixes: a5d2d29e24be ("dma-buf: heaps: Move heap-helper logic into the cma_heap implementation") Signed-off-by: John Stultz Signed-off-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/20210107202616.75170-1-john.stultz@linaro.org --- drivers/dma-buf/heaps/cma_heap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/dma-buf/heaps/cma_heap.c b/drivers/dma-buf/heaps/cma_heap.c index 3c4e34301172..364fc2f3e499 100644 --- a/drivers/dma-buf/heaps/cma_heap.c +++ b/drivers/dma-buf/heaps/cma_heap.c @@ -251,6 +251,9 @@ static void cma_heap_dma_buf_release(struct dma_buf *dmabuf) buffer->vaddr = NULL; } + /* free page list */ + kfree(buffer->pages); + /* release memory */ cma_release(cma_heap->cma, buffer->cma_pages, buffer->pagecount); kfree(buffer); } From 00cb645fd7e29bdd20967cd20fa8f77bcdf422f9 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 18 Nov 2020 13:40:58 +0100 Subject: [PATCH 045/241] drm/i915/dsi: Use unconditional msleep for the panel_on_delay when there is no reset-deassert MIPI-sequence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 25b4620ee822 ("drm/i915/dsi: Skip delays for v3 VBTs in vid-mode") added an intel_dsi_msleep() helper which skips sleeping if the MIPI-sequences have a version of 3 or newer and the panel is in vid-mode; and it moved a bunch of msleep-s over to this new helper. This was based on my reading of the big comment around line 730 which starts with "Panel enable/disable sequences from the VBT spec.", where the "v3 video mode seq" column does not have any wait t# entries. Given that this code has been used on a lot of different devices without issues until now, it seems that my interpretation of the spec here is mostly correct. But now I have encountered one device, an Acer Aspire Switch 10 E SW3-016, where the panel will not light up unless we do actually honor the panel_on_delay after exexuting the MIPI_SEQ_PANEL_ON sequence. What seems to set this model apart is that it is lacking a MIPI_SEQ_DEASSERT_RESET sequence, which is where the power-on delay usually happens. Fix the panel not lighting up on this model by using an unconditional msleep(panel_on_delay) instead of intel_dsi_msleep() when there is no MIPI_SEQ_DEASSERT_RESET sequence. Fixes: 25b4620ee822 ("drm/i915/dsi: Skip delays for v3 VBTs in vid-mode") Signed-off-by: Hans de Goede Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20201118124058.26021-1-hdegoede@redhat.com (cherry picked from commit 6fdb335f1c9c0845b50625de1624d8445c4c4a07) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/vlv_dsi.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/display/vlv_dsi.c b/drivers/gpu/drm/i915/display/vlv_dsi.c index d52f9c177908..f94025ec603a 100644 --- a/drivers/gpu/drm/i915/display/vlv_dsi.c +++ b/drivers/gpu/drm/i915/display/vlv_dsi.c @@ -812,10 +812,20 @@ static void intel_dsi_pre_enable(struct intel_atomic_state *state, intel_dsi_prepare(encoder, pipe_config); intel_dsi_vbt_exec_sequence(intel_dsi, MIPI_SEQ_POWER_ON); - intel_dsi_msleep(intel_dsi, intel_dsi->panel_on_delay); - /* Deassert reset */ - intel_dsi_vbt_exec_sequence(intel_dsi, MIPI_SEQ_DEASSERT_RESET); + /* + * Give the panel time to power-on and then deassert its reset. + * Depending on the VBT MIPI sequences version the deassert-seq + * may contain the necessary delay, intel_dsi_msleep() will skip + * the delay in that case. If there is no deassert-seq, then an + * unconditional msleep is used to give the panel time to power-on. + */ + if (dev_priv->vbt.dsi.sequence[MIPI_SEQ_DEASSERT_RESET]) { + intel_dsi_msleep(intel_dsi, intel_dsi->panel_on_delay); + intel_dsi_vbt_exec_sequence(intel_dsi, MIPI_SEQ_DEASSERT_RESET); + } else { + msleep(intel_dsi->panel_on_delay); + } if (IS_GEMINILAKE(dev_priv)) { glk_cold_boot = glk_dsi_enable_io(encoder); From 057fe3535eb35696ad5a849d01d61efa930d2182 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 4 Jan 2021 20:39:05 +0000 Subject: [PATCH 046/241] drm/i915: Disable RPM wakeref assertions during driver shutdown MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As with the regular suspend paths, also disable the wakeref assertions as we disable the driver during shutdown. Reported-by: Hans de Goede Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2899 Fixes: fe0f1e3bfdfe ("drm/i915: Shut down displays gracefully on reboot") Signed-off-by: Chris Wilson Cc: Ville Syrjälä Cc: Hans de Goede Tested-by: Hans de Goede Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20210104203905.19248-1-chris@chris-wilson.co.uk (cherry picked from commit 19fe4ac6f0e7163daf9375a4d39947389ae465fa) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/i915_drv.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/i915/i915_drv.c b/drivers/gpu/drm/i915/i915_drv.c index 88ad754962af..99eb0d7bbc44 100644 --- a/drivers/gpu/drm/i915/i915_drv.c +++ b/drivers/gpu/drm/i915/i915_drv.c @@ -1047,6 +1047,8 @@ static void intel_shutdown_encoders(struct drm_i915_private *dev_priv) void i915_driver_shutdown(struct drm_i915_private *i915) { + disable_rpm_wakeref_asserts(&i915->runtime_pm); + i915_gem_suspend(i915); drm_kms_helper_poll_disable(&i915->drm); @@ -1060,6 +1062,8 @@ void i915_driver_shutdown(struct drm_i915_private *i915) intel_suspend_encoders(i915); intel_shutdown_encoders(i915); + + enable_rpm_wakeref_asserts(&i915->runtime_pm); } static bool suspend_to_idle(struct drm_i915_private *dev_priv) From bb83d5fb550bb7db75b29e6342417fda2bbb691c Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Fri, 8 Jan 2021 17:28:41 +0200 Subject: [PATCH 047/241] drm/i915/backlight: fix CPU mode backlight takeover on LPT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The pch_get_backlight(), lpt_get_backlight(), and lpt_set_backlight() functions operate directly on the hardware registers. If inverting the value is needed, using intel_panel_compute_brightness(), it should only be done in the interface between hardware registers and panel->backlight.level. The CPU mode takeover code added in commit 5b1ec9ac7ab5 ("drm/i915/backlight: Fix backlight takeover on LPT, v3.") reads the hardware register and converts to panel->backlight.level correctly, however the value written back should remain in the hardware register "domain". This hasn't been an issue, because GM45 machines are the only known users of i915.invert_brightness and the brightness invert quirk, and without one of them no conversion is made. It's likely nobody's ever hit the problem. Fixes: 5b1ec9ac7ab5 ("drm/i915/backlight: Fix backlight takeover on LPT, v3.") Cc: Maarten Lankhorst Cc: Ville Syrjälä Cc: Lyude Paul Cc: # v5.1+ Reviewed-by: Lyude Paul Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20210108152841.6944-1-jani.nikula@intel.com (cherry picked from commit 0d4ced1c5bfe649196877d90442d4fd618e19153) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_panel.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_panel.c b/drivers/gpu/drm/i915/display/intel_panel.c index 9f23bac0d792..d64fce1a17cb 100644 --- a/drivers/gpu/drm/i915/display/intel_panel.c +++ b/drivers/gpu/drm/i915/display/intel_panel.c @@ -1650,16 +1650,13 @@ static int lpt_setup_backlight(struct intel_connector *connector, enum pipe unus val = pch_get_backlight(connector); else val = lpt_get_backlight(connector); - val = intel_panel_compute_brightness(connector, val); - panel->backlight.level = clamp(val, panel->backlight.min, - panel->backlight.max); if (cpu_mode) { drm_dbg_kms(&dev_priv->drm, "CPU backlight register was enabled, switching to PCH override\n"); /* Write converted CPU PWM value to PCH override register */ - lpt_set_backlight(connector->base.state, panel->backlight.level); + lpt_set_backlight(connector->base.state, val); intel_de_write(dev_priv, BLC_PWM_PCH_CTL1, pch_ctl1 | BLM_PCH_OVERRIDE_ENABLE); @@ -1667,6 +1664,10 @@ static int lpt_setup_backlight(struct intel_connector *connector, enum pipe unus cpu_ctl2 & ~BLM_PWM_ENABLE); } + val = intel_panel_compute_brightness(connector, val); + panel->backlight.level = clamp(val, panel->backlight.min, + panel->backlight.max); + return 0; } From d434ab6db524ab1efd0afad4ffa1ee65ca6ac097 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 11 Jan 2021 04:00:30 +0000 Subject: [PATCH 048/241] io_uring: drop mm and files after task_work_run __io_req_task_submit() run by task_work can set mm and files, but io_sq_thread() in some cases, and because __io_sq_thread_acquire_mm() and __io_sq_thread_acquire_files() do a simple current->mm/files check it may end up submitting IO with mm/files of another task. We also need to drop it after in the end to drop potentially grabbed references to them. Cc: stable@vger.kernel.org # 5.9+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 2f305c097bd5..7af74c1ec909 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -7056,6 +7056,7 @@ static int io_sq_thread(void *data) if (sqt_spin || !time_after(jiffies, timeout)) { io_run_task_work(); + io_sq_thread_drop_mm_files(); cond_resched(); if (sqt_spin) timeout = jiffies + sqd->sq_thread_idle; @@ -7093,6 +7094,7 @@ static int io_sq_thread(void *data) } io_run_task_work(); + io_sq_thread_drop_mm_files(); if (cur_css) io_sq_thread_unassociate_blkcg(); From 621fadc22365f3cf307bcd9048e3372e9ee9cdcc Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 11 Jan 2021 04:00:31 +0000 Subject: [PATCH 049/241] io_uring: don't take files/mm for a dead task In rare cases a task may be exiting while io_ring_exit_work() trying to cancel/wait its requests. It's ok for __io_sq_thread_acquire_mm() because of SQPOLL check, but is not for __io_sq_thread_acquire_files(). Play safe and fail for both of them. Cc: stable@vger.kernel.org # 5.5+ Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 7af74c1ec909..b0e6d8e607a3 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -1106,6 +1106,9 @@ static void io_sq_thread_drop_mm_files(void) static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) { + if (current->flags & PF_EXITING) + return -EFAULT; + if (!current->files) { struct files_struct *files; struct nsproxy *nsproxy; @@ -1133,6 +1136,8 @@ static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) { struct mm_struct *mm; + if (current->flags & PF_EXITING) + return -EFAULT; if (current->mm) return 0; From 2af5268180410b874fc06be91a1b2fbb22b1be0c Mon Sep 17 00:00:00 2001 From: Imre Deak Date: Wed, 9 Dec 2020 17:39:52 +0200 Subject: [PATCH 050/241] drm/i915/icl: Fix initing the DSI DSC power refcount during HW readout For an enabled DSC during HW readout the corresponding power reference is taken along the CRTC power domain references in get_crtc_power_domains(). Remove the incorrect get ref from the DSI encoder hook. Fixes: 2b68392e638d ("drm/i915/dsi: add support for DSC") Cc: Vandita Kulkarni Cc: Jani Nikula Signed-off-by: Imre Deak Reviewed-by: Anshuman Gupta Link: https://patchwork.freedesktop.org/patch/msgid/20201209153952.3397959-1-imre.deak@intel.com (cherry picked from commit 3a9ec563a4ff770ae647f6ee539810f1866866c9) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/icl_dsi.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/gpu/drm/i915/display/icl_dsi.c b/drivers/gpu/drm/i915/display/icl_dsi.c index a9439b415603..b3533a32f8ba 100644 --- a/drivers/gpu/drm/i915/display/icl_dsi.c +++ b/drivers/gpu/drm/i915/display/icl_dsi.c @@ -1616,10 +1616,6 @@ static void gen11_dsi_get_power_domains(struct intel_encoder *encoder, get_dsi_io_power_domains(i915, enc_to_intel_dsi(encoder)); - - if (crtc_state->dsc.compression_enable) - intel_display_power_get(i915, - intel_dsc_power_domain(crtc_state)); } static bool gen11_dsi_get_hw_state(struct intel_encoder *encoder, From a58015d638cd4e4555297b04bec9b49028369075 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Thu, 7 Jan 2021 23:23:48 -0800 Subject: [PATCH 051/241] ACPI: scan: Harden acpi_device_add() against device ID overflows Linux VM on Hyper-V crashes with the latest mainline: [ 4.069624] detected buffer overflow in strcpy [ 4.077733] kernel BUG at lib/string.c:1149! .. [ 4.085819] RIP: 0010:fortify_panic+0xf/0x11 ... [ 4.085819] Call Trace: [ 4.085819] acpi_device_add.cold.15+0xf2/0xfb [ 4.085819] acpi_add_single_object+0x2a6/0x690 [ 4.085819] acpi_bus_check_add+0xc6/0x280 [ 4.085819] acpi_ns_walk_namespace+0xda/0x1aa [ 4.085819] acpi_walk_namespace+0x9a/0xc2 [ 4.085819] acpi_bus_scan+0x78/0x90 [ 4.085819] acpi_scan_init+0xfa/0x248 [ 4.085819] acpi_init+0x2c1/0x321 [ 4.085819] do_one_initcall+0x44/0x1d0 [ 4.085819] kernel_init_freeable+0x1ab/0x1f4 This is because of the recent buffer overflow detection in the commit 6a39e62abbaf ("lib: string.h: detect intra-object overflow in fortified string functions") Here acpi_device_bus_id->bus_id can only hold 14 characters, while the the acpi_device_hid(device) returns a 22-char string "HYPER_V_GEN_COUNTER_V1". Per ACPI Spec v6.2, Section 6.1.5 _HID (Hardware ID), if the ID is a string, it must be of the form AAA#### or NNNN####, i.e. 7 chars or 8 chars. The field bus_id in struct acpi_device_bus_id was originally defined as char bus_id[9], and later was enlarged to char bus_id[15] in 2007 in the commit bb0958544f3c ("ACPI: use more understandable bus_id for ACPI devices") Fix the issue by changing the field bus_id to const char *, and use kstrdup_const() to initialize it. Signed-off-by: Dexuan Cui Tested-By: Jethro Beekman [ rjw: Subject change, whitespace adjustment ] Cc: All applicable Signed-off-by: Rafael J. Wysocki --- drivers/acpi/internal.h | 2 +- drivers/acpi/scan.c | 15 ++++++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/internal.h b/drivers/acpi/internal.h index cb229e24c563..e6a5d997241c 100644 --- a/drivers/acpi/internal.h +++ b/drivers/acpi/internal.h @@ -97,7 +97,7 @@ void acpi_scan_table_handler(u32 event, void *table, void *context); extern struct list_head acpi_bus_id_list; struct acpi_device_bus_id { - char bus_id[15]; + const char *bus_id; unsigned int instance_no; struct list_head node; }; diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 80b668c80073..58ff36340cd7 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -486,6 +486,7 @@ static void acpi_device_del(struct acpi_device *device) acpi_device_bus_id->instance_no--; else { list_del(&acpi_device_bus_id->node); + kfree_const(acpi_device_bus_id->bus_id); kfree(acpi_device_bus_id); } break; @@ -674,7 +675,14 @@ int acpi_device_add(struct acpi_device *device, } if (!found) { acpi_device_bus_id = new_bus_id; - strcpy(acpi_device_bus_id->bus_id, acpi_device_hid(device)); + acpi_device_bus_id->bus_id = + kstrdup_const(acpi_device_hid(device), GFP_KERNEL); + if (!acpi_device_bus_id->bus_id) { + pr_err(PREFIX "Memory allocation error for bus id\n"); + result = -ENOMEM; + goto err_free_new_bus_id; + } + acpi_device_bus_id->instance_no = 0; list_add_tail(&acpi_device_bus_id->node, &acpi_bus_id_list); } @@ -709,6 +717,11 @@ int acpi_device_add(struct acpi_device *device, if (device->parent) list_del(&device->node); list_del(&device->wakeup_list); + + err_free_new_bus_id: + if (!found) + kfree(new_bus_id); + mutex_unlock(&acpi_device_lock); err_detach: From 843010a815e87b45fc6b64848f02e42f6aee3f22 Mon Sep 17 00:00:00 2001 From: Jeremy Cline Date: Mon, 11 Jan 2021 11:40:33 -0500 Subject: [PATCH 052/241] drm/ttm: Fix address passed to dma_mapping_error() in ttm_pool_map() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit check_unmap() is producing a warning about a missing map error check. The return value from dma_map_page() should be checked for an error, not the caller-provided dma_addr. Fixes: d099fc8f540a ("drm/ttm: new TT backend allocation pool v3") Signed-off-by: Jeremy Cline Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/413432/ Signed-off-by: Christian König --- drivers/gpu/drm/ttm/ttm_pool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index a00b7ab9c14c..255c45734979 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -190,7 +190,7 @@ static int ttm_pool_map(struct ttm_pool *pool, unsigned int order, size_t size = (1ULL << order) * PAGE_SIZE; addr = dma_map_page(pool->dev, p, 0, size, DMA_BIDIRECTIONAL); - if (dma_mapping_error(pool->dev, **dma_addr)) + if (dma_mapping_error(pool->dev, addr)) return -EFAULT; } From a5e92ef3c3fd46320d4e293bdec0cdd4b80a6e0f Mon Sep 17 00:00:00 2001 From: Bas Nieuwenhuizen Date: Sun, 10 Jan 2021 03:11:42 +0100 Subject: [PATCH 053/241] drm: Check actual format for legacy pageflip. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With modifiers one can actually have different format_info structs for the same format, which now matters for AMDGPU since we convert implicit modifiers to explicit modifiers with multiple planes. I checked other drivers and it doesn't look like they end up triggering this case so I think this is safe to relax. Signed-off-by: Bas Nieuwenhuizen Reviewed-by: Daniel Vetter Reviewed-by: Zhan Liu Acked-by: Christian König Acked-by: Alex Deucher Fixes: 816853f9dc40 ("drm/amd/display: Set new format info for converted metadata.") Signed-off-by: Alex Deucher Link: https://patchwork.freedesktop.org/patch/msgid/20210110021142.28221-1-bas@basnieuwenhuizen.nl --- drivers/gpu/drm/drm_plane.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_plane.c b/drivers/gpu/drm/drm_plane.c index e6231947f987..a0cb746bcb0a 100644 --- a/drivers/gpu/drm/drm_plane.c +++ b/drivers/gpu/drm/drm_plane.c @@ -1163,7 +1163,14 @@ retry: if (ret) goto out; - if (old_fb->format != fb->format) { + /* + * Only check the FOURCC format code, excluding modifiers. This is + * enough for all legacy drivers. Atomic drivers have their own + * checks in their ->atomic_check implementation, which will + * return -EINVAL if any hw or driver constraint is violated due + * to modifier changes. + */ + if (old_fb->format->format != fb->format->format) { DRM_DEBUG_KMS("Page flip is not allowed to change frame buffer format.\n"); ret = -EINVAL; goto out; From 5541075a348b6ca6ac668653f7d2c423ae8e00b6 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 11 Jan 2021 20:16:50 +0100 Subject: [PATCH 054/241] bpf: Prevent double bpf_prog_put call from bpf_tracing_prog_attach MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bpf_tracing_prog_attach error path calls bpf_prog_put on prog, which causes refcount underflow when it's called from link_create function. link_create prog = bpf_prog_get <-- get ... tracing_bpf_link_attach(prog.. bpf_tracing_prog_attach(prog.. out_put_prog: bpf_prog_put(prog); <-- put if (ret < 0) bpf_prog_put(prog); <-- put Removing bpf_prog_put call from bpf_tracing_prog_attach and making sure its callers call it instead. Fixes: 4a1e7c0c63e0 ("bpf: Support attaching freplace programs to multiple attach points") Signed-off-by: Jiri Olsa Signed-off-by: Daniel Borkmann Acked-by: Toke Høiland-Jørgensen Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20210111191650.1241578-1-jolsa@kernel.org --- kernel/bpf/syscall.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index c3bb03c8371f..e5999d86c76e 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -2712,7 +2712,6 @@ out_unlock: out_put_prog: if (tgt_prog_fd && tgt_prog) bpf_prog_put(tgt_prog); - bpf_prog_put(prog); return err; } @@ -2825,7 +2824,10 @@ static int bpf_raw_tracepoint_open(const union bpf_attr *attr) tp_name = prog->aux->attach_func_name; break; } - return bpf_tracing_prog_attach(prog, 0, 0); + err = bpf_tracing_prog_attach(prog, 0, 0); + if (err >= 0) + return err; + goto out_put_prog; case BPF_PROG_TYPE_RAW_TRACEPOINT: case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: if (strncpy_from_user(buf, From 2225a8dda263edc35a0e8b858fe2945cf6240fde Mon Sep 17 00:00:00 2001 From: Ariel Marcovitch Date: Sat, 2 Jan 2021 22:11:56 +0200 Subject: [PATCH 055/241] powerpc: Fix alignment bug within the init sections This is a bug that causes early crashes in builds with an .exit.text section smaller than a page and an .init.text section that ends in the beginning of a physical page (this is kinda random, which might explain why this wasn't really encountered before). The init sections are ordered like this: .init.text .exit.text .init.data Currently, these sections aren't page aligned. Because the init code might become read-only at runtime and because the .init.text section can potentially reside on the same physical page as .init.data, the beginning of .init.data might be mapped read-only along with .init.text. Then when the kernel tries to modify a variable in .init.data (like kthreadd_done, used in kernel_init()) the kernel panics. To avoid this, make _einittext page aligned and also align .exit.text to make sure .init.data is always seperated from the text segments. Fixes: 060ef9d89d18 ("powerpc32: PAGE_EXEC required for inittext") Signed-off-by: Ariel Marcovitch Reviewed-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20210102201156.10805-1-ariel.marcovitch@gmail.com --- arch/powerpc/kernel/vmlinux.lds.S | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 8e0b1298bf19..4ab426b8b0e0 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -187,6 +187,12 @@ SECTIONS .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { _sinittext = .; INIT_TEXT + + /* + *.init.text might be RO so we must ensure this section ends on + * a page boundary. + */ + . = ALIGN(PAGE_SIZE); _einittext = .; #ifdef CONFIG_PPC64 *(.tramp.ftrace.init); @@ -200,6 +206,8 @@ SECTIONS EXIT_TEXT } + . = ALIGN(PAGE_SIZE); + INIT_DATA_SECTION(16) . = ALIGN(8); From bb52cb0dec8d2fecdb22843a805131478a180728 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20K=C3=B6nig?= Date: Mon, 11 Jan 2021 14:44:57 +0100 Subject: [PATCH 056/241] drm/ttm: make the pool shrinker lock a mutex MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit set_pages_wb() might sleep and so we can't do this in an atomic context. Signed-off-by: Christian König Reported-by: Mikhail Gavrilov Tested-by: Mikhail Gavrilov Fixes: d099fc8f540a ("drm/ttm: new TT backend allocation pool v3") Reviewed-by: Huang Rui Link: https://patchwork.freedesktop.org/patch/413409/ --- drivers/gpu/drm/ttm/ttm_pool.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_pool.c b/drivers/gpu/drm/ttm/ttm_pool.c index 255c45734979..8cd776adc592 100644 --- a/drivers/gpu/drm/ttm/ttm_pool.c +++ b/drivers/gpu/drm/ttm/ttm_pool.c @@ -66,7 +66,7 @@ static struct ttm_pool_type global_uncached[MAX_ORDER]; static struct ttm_pool_type global_dma32_write_combined[MAX_ORDER]; static struct ttm_pool_type global_dma32_uncached[MAX_ORDER]; -static spinlock_t shrinker_lock; +static struct mutex shrinker_lock; static struct list_head shrinker_list; static struct shrinker mm_shrinker; @@ -249,9 +249,9 @@ static void ttm_pool_type_init(struct ttm_pool_type *pt, struct ttm_pool *pool, spin_lock_init(&pt->lock); INIT_LIST_HEAD(&pt->pages); - spin_lock(&shrinker_lock); + mutex_lock(&shrinker_lock); list_add_tail(&pt->shrinker_list, &shrinker_list); - spin_unlock(&shrinker_lock); + mutex_unlock(&shrinker_lock); } /* Remove a pool_type from the global shrinker list and free all pages */ @@ -259,9 +259,9 @@ static void ttm_pool_type_fini(struct ttm_pool_type *pt) { struct page *p, *tmp; - spin_lock(&shrinker_lock); + mutex_lock(&shrinker_lock); list_del(&pt->shrinker_list); - spin_unlock(&shrinker_lock); + mutex_unlock(&shrinker_lock); list_for_each_entry_safe(p, tmp, &pt->pages, lru) ttm_pool_free_page(pt->pool, pt->caching, pt->order, p); @@ -302,7 +302,7 @@ static unsigned int ttm_pool_shrink(void) unsigned int num_freed; struct page *p; - spin_lock(&shrinker_lock); + mutex_lock(&shrinker_lock); pt = list_first_entry(&shrinker_list, typeof(*pt), shrinker_list); p = ttm_pool_type_take(pt); @@ -314,7 +314,7 @@ static unsigned int ttm_pool_shrink(void) } list_move_tail(&pt->shrinker_list, &shrinker_list); - spin_unlock(&shrinker_lock); + mutex_unlock(&shrinker_lock); return num_freed; } @@ -564,7 +564,7 @@ int ttm_pool_debugfs(struct ttm_pool *pool, struct seq_file *m) { unsigned int i; - spin_lock(&shrinker_lock); + mutex_lock(&shrinker_lock); seq_puts(m, "\t "); for (i = 0; i < MAX_ORDER; ++i) @@ -600,7 +600,7 @@ int ttm_pool_debugfs(struct ttm_pool *pool, struct seq_file *m) seq_printf(m, "\ntotal\t: %8lu of %8lu\n", atomic_long_read(&allocated_pages), page_pool_size); - spin_unlock(&shrinker_lock); + mutex_unlock(&shrinker_lock); return 0; } @@ -644,7 +644,7 @@ int ttm_pool_mgr_init(unsigned long num_pages) if (!page_pool_size) page_pool_size = num_pages; - spin_lock_init(&shrinker_lock); + mutex_init(&shrinker_lock); INIT_LIST_HEAD(&shrinker_list); for (i = 0; i < MAX_ORDER; ++i) { From 51b2ee7d006a736a9126e8111d1f24e4fd0afaa6 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Mon, 11 Jan 2021 16:01:29 -0500 Subject: [PATCH 057/241] nfsd4: readdirplus shouldn't return parent of export If you export a subdirectory of a filesystem, a READDIRPLUS on the root of that export will return the filehandle of the parent with the ".." entry. The filehandle is optional, so let's just not return the filehandle for ".." if we're at the root of an export. Note that once the client learns one filehandle outside of the export, they can trivially access the rest of the export using further lookups. However, it is also not very difficult to guess filehandles outside of the export. So exporting a subdirectory of a filesystem should considered equivalent to providing access to the entire filesystem. To avoid confusion, we recommend only exporting entire filesystems. Reported-by: Youjipeng Signed-off-by: J. Bruce Fields Cc: stable@vger.kernel.org Signed-off-by: Chuck Lever --- fs/nfsd/nfs3xdr.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 821db21ba072..34b880211e5e 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -865,9 +865,14 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, if (isdotent(name, namlen)) { if (namlen == 2) { dchild = dget_parent(dparent); - /* filesystem root - cannot return filehandle for ".." */ + /* + * Don't return filehandle for ".." if we're at + * the filesystem or export root: + */ if (dchild == dparent) goto out; + if (dparent == exp->ex_path.dentry) + goto out; } else dchild = dget(dparent); } else From 1a9c72ad4c26821e215a396167c14959cf24a7f1 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 12 Jan 2021 07:55:24 +0000 Subject: [PATCH 058/241] bpf: Local storage helpers should check nullness of owner ptr passed The verifier allows ARG_PTR_TO_BTF_ID helper arguments to be NULL, so helper implementations need to check this before dereferencing them. This was already fixed for the socket storage helpers but not for task and inode. The issue can be reproduced by attaching an LSM program to inode_rename hook (called when moving files) which tries to get the inode of the new file without checking for its nullness and then trying to move an existing file to a new path: mv existing_file new_file_does_not_exist The report including the sample program and the steps for reproducing the bug: https://lore.kernel.org/bpf/CANaYP3HWkH91SN=wTNO9FL_2ztHfqcXKX38SSE-JJ2voh+vssw@mail.gmail.com Fixes: 4cf1bc1f1045 ("bpf: Implement task local storage") Fixes: 8ea636848aca ("bpf: Implement bpf_local_storage for inodes") Reported-by: Gilad Reti Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210112075525.256820-3-kpsingh@kernel.org --- kernel/bpf/bpf_inode_storage.c | 5 ++++- kernel/bpf/bpf_task_storage.c | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index 6edff97ad594..dbc1dbdd2cbf 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -176,7 +176,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, * bpf_local_storage_update expects the owner to have a * valid storage pointer. */ - if (!inode_storage_ptr(inode)) + if (!inode || !inode_storage_ptr(inode)) return (unsigned long)NULL; sdata = inode_storage_lookup(inode, map, true); @@ -200,6 +200,9 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, BPF_CALL_2(bpf_inode_storage_delete, struct bpf_map *, map, struct inode *, inode) { + if (!inode) + return -EINVAL; + /* This helper must only called from where the inode is gurranteed * to have a refcount and cannot be freed. */ diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c index 4ef1959a78f2..e0da0258b732 100644 --- a/kernel/bpf/bpf_task_storage.c +++ b/kernel/bpf/bpf_task_storage.c @@ -218,7 +218,7 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, * bpf_local_storage_update expects the owner to have a * valid storage pointer. */ - if (!task_storage_ptr(task)) + if (!task || !task_storage_ptr(task)) return (unsigned long)NULL; sdata = task_storage_lookup(task, map, true); @@ -243,6 +243,9 @@ BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *, BPF_CALL_2(bpf_task_storage_delete, struct bpf_map *, map, struct task_struct *, task) { + if (!task) + return -EINVAL; + /* This helper must only be called from places where the lifetime of the task * is guaranteed. Either by being refcounted or by being protected * by an RCU read-side critical section. From 84d571d46c7046a957ff3d1c916a1b9dcc7f1ce8 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 12 Jan 2021 07:55:25 +0000 Subject: [PATCH 059/241] bpf: Fix typo in bpf_inode_storage.c Fix "gurranteed" -> "guaranteed" in bpf_inode_storage.c Suggested-by: Andrii Nakryiko Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210112075525.256820-4-kpsingh@kernel.org --- kernel/bpf/bpf_inode_storage.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c index dbc1dbdd2cbf..2f0597320b6d 100644 --- a/kernel/bpf/bpf_inode_storage.c +++ b/kernel/bpf/bpf_inode_storage.c @@ -183,7 +183,7 @@ BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode, if (sdata) return (unsigned long)sdata->data; - /* This helper must only called from where the inode is gurranteed + /* This helper must only called from where the inode is guaranteed * to have a refcount and cannot be freed. */ if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) { @@ -203,7 +203,7 @@ BPF_CALL_2(bpf_inode_storage_delete, if (!inode) return -EINVAL; - /* This helper must only called from where the inode is gurranteed + /* This helper must only called from where the inode is guaranteed * to have a refcount and cannot be freed. */ return inode_storage_delete(inode, map); From 2f94ac19184665263b7a285ae88abe19dedf9c1b Mon Sep 17 00:00:00 2001 From: KP Singh Date: Tue, 12 Jan 2021 07:55:23 +0000 Subject: [PATCH 060/241] bpf: Update local storage test to check handling of null ptrs It was found in [1] that bpf_inode_storage_get helper did not check the nullness of the passed owner ptr which caused an oops when dereferenced. This change incorporates the example suggested in [1] into the local storage selftest. The test is updated to create a temporary directory instead of just using a tempfile. In order to replicate the issue this copied rm binary is renamed tiggering the inode_rename with a null pointer for the new_inode. The logic to verify the setting and deletion of the inode local storage of the old inode is also moved to this LSM hook. The change also removes the copy_rm function and simply shells out to copy files and recursively delete directories and consolidates the logic of setting the initial inode storage to the bprm_committed_creds hook and removes the file_open hook. [1]: https://lore.kernel.org/bpf/CANaYP3HWkH91SN=wTNO9FL_2ztHfqcXKX38SSE-JJ2voh+vssw@mail.gmail.com Suggested-by: Gilad Reti Signed-off-by: KP Singh Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210112075525.256820-2-kpsingh@kernel.org --- .../bpf/prog_tests/test_local_storage.c | 96 +++++-------------- .../selftests/bpf/progs/local_storage.c | 62 ++++++------ 2 files changed, 61 insertions(+), 97 deletions(-) diff --git a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c index c0fe73a17ed1..3bfcf00c0a67 100644 --- a/tools/testing/selftests/bpf/prog_tests/test_local_storage.c +++ b/tools/testing/selftests/bpf/prog_tests/test_local_storage.c @@ -34,61 +34,6 @@ struct storage { struct bpf_spin_lock lock; }; -/* Copies an rm binary to a temp file. dest is a mkstemp template */ -static int copy_rm(char *dest) -{ - int fd_in, fd_out = -1, ret = 0; - struct stat stat; - char *buf = NULL; - - fd_in = open("/bin/rm", O_RDONLY); - if (fd_in < 0) - return -errno; - - fd_out = mkstemp(dest); - if (fd_out < 0) { - ret = -errno; - goto out; - } - - ret = fstat(fd_in, &stat); - if (ret == -1) { - ret = -errno; - goto out; - } - - buf = malloc(stat.st_blksize); - if (!buf) { - ret = -errno; - goto out; - } - - while (ret = read(fd_in, buf, stat.st_blksize), ret > 0) { - ret = write(fd_out, buf, ret); - if (ret < 0) { - ret = -errno; - goto out; - - } - } - if (ret < 0) { - ret = -errno; - goto out; - - } - - /* Set executable permission on the copied file */ - ret = chmod(dest, 0100); - if (ret == -1) - ret = -errno; - -out: - free(buf); - close(fd_in); - close(fd_out); - return ret; -} - /* Fork and exec the provided rm binary and return the exit code of the * forked process and its pid. */ @@ -168,9 +113,11 @@ static bool check_syscall_operations(int map_fd, int obj_fd) void test_test_local_storage(void) { - char tmp_exec_path[PATH_MAX] = "/tmp/copy_of_rmXXXXXX"; + char tmp_dir_path[64] = "/tmp/local_storageXXXXXX"; int err, serv_sk = -1, task_fd = -1, rm_fd = -1; struct local_storage *skel = NULL; + char tmp_exec_path[64]; + char cmd[256]; skel = local_storage__open_and_load(); if (CHECK(!skel, "skel_load", "lsm skeleton failed\n")) @@ -189,18 +136,24 @@ void test_test_local_storage(void) task_fd)) goto close_prog; - err = copy_rm(tmp_exec_path); - if (CHECK(err < 0, "copy_rm", "err %d errno %d\n", err, errno)) + if (CHECK(!mkdtemp(tmp_dir_path), "mkdtemp", + "unable to create tmpdir: %d\n", errno)) goto close_prog; + snprintf(tmp_exec_path, sizeof(tmp_exec_path), "%s/copy_of_rm", + tmp_dir_path); + snprintf(cmd, sizeof(cmd), "cp /bin/rm %s", tmp_exec_path); + if (CHECK_FAIL(system(cmd))) + goto close_prog_rmdir; + rm_fd = open(tmp_exec_path, O_RDONLY); if (CHECK(rm_fd < 0, "open", "failed to open %s err:%d, errno:%d", tmp_exec_path, rm_fd, errno)) - goto close_prog; + goto close_prog_rmdir; if (!check_syscall_operations(bpf_map__fd(skel->maps.inode_storage_map), rm_fd)) - goto close_prog; + goto close_prog_rmdir; /* Sets skel->bss->monitored_pid to the pid of the forked child * forks a child process that executes tmp_exec_path and tries to @@ -209,33 +162,36 @@ void test_test_local_storage(void) */ err = run_self_unlink(&skel->bss->monitored_pid, tmp_exec_path); if (CHECK(err != EPERM, "run_self_unlink", "err %d want EPERM\n", err)) - goto close_prog_unlink; + goto close_prog_rmdir; /* Set the process being monitored to be the current process */ skel->bss->monitored_pid = getpid(); - /* Remove the temporary created executable */ - err = unlink(tmp_exec_path); - if (CHECK(err != 0, "unlink", "unable to unlink %s: %d", tmp_exec_path, - errno)) - goto close_prog_unlink; + /* Move copy_of_rm to a new location so that it triggers the + * inode_rename LSM hook with a new_dentry that has a NULL inode ptr. + */ + snprintf(cmd, sizeof(cmd), "mv %s/copy_of_rm %s/check_null_ptr", + tmp_dir_path, tmp_dir_path); + if (CHECK_FAIL(system(cmd))) + goto close_prog_rmdir; CHECK(skel->data->inode_storage_result != 0, "inode_storage_result", "inode_local_storage not set\n"); serv_sk = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0); if (CHECK(serv_sk < 0, "start_server", "failed to start server\n")) - goto close_prog; + goto close_prog_rmdir; CHECK(skel->data->sk_storage_result != 0, "sk_storage_result", "sk_local_storage not set\n"); if (!check_syscall_operations(bpf_map__fd(skel->maps.sk_storage_map), serv_sk)) - goto close_prog; + goto close_prog_rmdir; -close_prog_unlink: - unlink(tmp_exec_path); +close_prog_rmdir: + snprintf(cmd, sizeof(cmd), "rm -rf %s", tmp_dir_path); + system(cmd); close_prog: close(serv_sk); close(rm_fd); diff --git a/tools/testing/selftests/bpf/progs/local_storage.c b/tools/testing/selftests/bpf/progs/local_storage.c index 3e3de130f28f..95868bc7ada9 100644 --- a/tools/testing/selftests/bpf/progs/local_storage.c +++ b/tools/testing/selftests/bpf/progs/local_storage.c @@ -50,7 +50,6 @@ int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim) __u32 pid = bpf_get_current_pid_tgid() >> 32; struct local_storage *storage; bool is_self_unlink; - int err; if (pid != monitored_pid) return 0; @@ -66,8 +65,27 @@ int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim) return -EPERM; } - storage = bpf_inode_storage_get(&inode_storage_map, victim->d_inode, 0, - BPF_LOCAL_STORAGE_GET_F_CREATE); + return 0; +} + +SEC("lsm/inode_rename") +int BPF_PROG(inode_rename, struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) +{ + __u32 pid = bpf_get_current_pid_tgid() >> 32; + struct local_storage *storage; + int err; + + /* new_dentry->d_inode can be NULL when the inode is renamed to a file + * that did not exist before. The helper should be able to handle this + * NULL pointer. + */ + bpf_inode_storage_get(&inode_storage_map, new_dentry->d_inode, 0, + BPF_LOCAL_STORAGE_GET_F_CREATE); + + storage = bpf_inode_storage_get(&inode_storage_map, old_dentry->d_inode, + 0, 0); if (!storage) return 0; @@ -76,7 +94,7 @@ int BPF_PROG(unlink_hook, struct inode *dir, struct dentry *victim) inode_storage_result = -1; bpf_spin_unlock(&storage->lock); - err = bpf_inode_storage_delete(&inode_storage_map, victim->d_inode); + err = bpf_inode_storage_delete(&inode_storage_map, old_dentry->d_inode); if (!err) inode_storage_result = err; @@ -133,37 +151,18 @@ int BPF_PROG(socket_post_create, struct socket *sock, int family, int type, return 0; } -SEC("lsm/file_open") -int BPF_PROG(file_open, struct file *file) -{ - __u32 pid = bpf_get_current_pid_tgid() >> 32; - struct local_storage *storage; - - if (pid != monitored_pid) - return 0; - - if (!file->f_inode) - return 0; - - storage = bpf_inode_storage_get(&inode_storage_map, file->f_inode, 0, - BPF_LOCAL_STORAGE_GET_F_CREATE); - if (!storage) - return 0; - - bpf_spin_lock(&storage->lock); - storage->value = DUMMY_STORAGE_VALUE; - bpf_spin_unlock(&storage->lock); - return 0; -} - /* This uses the local storage to remember the inode of the binary that a * process was originally executing. */ SEC("lsm/bprm_committed_creds") void BPF_PROG(exec, struct linux_binprm *bprm) { + __u32 pid = bpf_get_current_pid_tgid() >> 32; struct local_storage *storage; + if (pid != monitored_pid) + return; + storage = bpf_task_storage_get(&task_storage_map, bpf_get_current_task_btf(), 0, BPF_LOCAL_STORAGE_GET_F_CREATE); @@ -172,4 +171,13 @@ void BPF_PROG(exec, struct linux_binprm *bprm) storage->exec_inode = bprm->file->f_inode; bpf_spin_unlock(&storage->lock); } + + storage = bpf_inode_storage_get(&inode_storage_map, bprm->file->f_inode, + 0, BPF_LOCAL_STORAGE_GET_F_CREATE); + if (!storage) + return; + + bpf_spin_lock(&storage->lock); + storage->value = DUMMY_STORAGE_VALUE; + bpf_spin_unlock(&storage->lock); } From 2d6ffc63f12417b979955a5b22ad9a76d2af5de9 Mon Sep 17 00:00:00 2001 From: Lu Baolu Date: Thu, 31 Dec 2020 08:53:20 +0800 Subject: [PATCH 061/241] iommu/vt-d: Fix unaligned addresses for intel_flush_svm_range_dev() The VT-d hardware will ignore those Addr bits which have been masked by the AM field in the PASID-based-IOTLB invalidation descriptor. As the result, if the starting address in the descriptor is not aligned with the address mask, some IOTLB caches might not invalidate. Hence people will see below errors. [ 1093.704661] dmar_fault: 29 callbacks suppressed [ 1093.704664] DMAR: DRHD: handling fault status reg 3 [ 1093.712738] DMAR: [DMA Read] Request device [7a:02.0] PASID 2 fault addr 7f81c968d000 [fault reason 113] SM: Present bit in first-level paging entry is clear Fix this by using aligned address for PASID-based-IOTLB invalidation. Fixes: 1c4f88b7f1f9 ("iommu/vt-d: Shared virtual address in scalable mode") Reported-and-tested-by: Guo Kaijie Signed-off-by: Lu Baolu Link: https://lore.kernel.org/r/20201231005323.2178523-2-baolu.lu@linux.intel.com Signed-off-by: Will Deacon --- drivers/iommu/intel/svm.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/drivers/iommu/intel/svm.c b/drivers/iommu/intel/svm.c index 790ef3497e7e..18a9f05df407 100644 --- a/drivers/iommu/intel/svm.c +++ b/drivers/iommu/intel/svm.c @@ -118,8 +118,10 @@ void intel_svm_check(struct intel_iommu *iommu) iommu->flags |= VTD_FLAG_SVM_CAPABLE; } -static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_dev *sdev, - unsigned long address, unsigned long pages, int ih) +static void __flush_svm_range_dev(struct intel_svm *svm, + struct intel_svm_dev *sdev, + unsigned long address, + unsigned long pages, int ih) { struct qi_desc desc; @@ -170,6 +172,22 @@ static void intel_flush_svm_range_dev (struct intel_svm *svm, struct intel_svm_d } } +static void intel_flush_svm_range_dev(struct intel_svm *svm, + struct intel_svm_dev *sdev, + unsigned long address, + unsigned long pages, int ih) +{ + unsigned long shift = ilog2(__roundup_pow_of_two(pages)); + unsigned long align = (1ULL << (VTD_PAGE_SHIFT + shift)); + unsigned long start = ALIGN_DOWN(address, align); + unsigned long end = ALIGN(address + (pages << VTD_PAGE_SHIFT), align); + + while (start < end) { + __flush_svm_range_dev(svm, sdev, start, align >> VTD_PAGE_SHIFT, ih); + start += align; + } +} + static void intel_flush_svm_range(struct intel_svm *svm, unsigned long address, unsigned long pages, int ih) { From b812834b5329fe78d643c9a61350d227db904361 Mon Sep 17 00:00:00 2001 From: Konrad Dybcio Date: Sat, 9 Jan 2021 17:56:21 +0100 Subject: [PATCH 062/241] iommu: arm-smmu-qcom: Add sdm630/msm8998 compatibles for qcom quirks SDM630 and MSM8998 are among the SoCs that use Qualcomm's implementation of SMMUv2 which has already proven to be problematic over the years. Add their compatibles to the lookup list to prevent the platforms from being shut down by the hypervisor at MMU probe. Signed-off-by: Konrad Dybcio Signed-off-by: AngeloGioacchino Del Regno Link: https://lore.kernel.org/r/20210109165622.149777-1-konrad.dybcio@somainline.org Signed-off-by: Will Deacon --- drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c index 1b83d140742f..bcda17012aee 100644 --- a/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c +++ b/drivers/iommu/arm/arm-smmu/arm-smmu-qcom.c @@ -325,7 +325,9 @@ static struct arm_smmu_device *qcom_smmu_create(struct arm_smmu_device *smmu, } static const struct of_device_id __maybe_unused qcom_smmu_impl_of_match[] = { + { .compatible = "qcom,msm8998-smmu-v2" }, { .compatible = "qcom,sc7180-smmu-500" }, + { .compatible = "qcom,sdm630-smmu-v2" }, { .compatible = "qcom,sdm845-smmu-500" }, { .compatible = "qcom,sm8150-smmu-500" }, { .compatible = "qcom,sm8250-smmu-500" }, From 694a1c0adebee9152a9ba0320468f7921aca647d Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Mon, 28 Dec 2020 09:26:14 +0800 Subject: [PATCH 063/241] iommu/vt-d: Fix duplicate included linux/dma-map-ops.h linux/dma-map-ops.h is included more than once, Remove the one that isn't necessary. Signed-off-by: Tian Tao Acked-by: Lu Baolu Link: https://lore.kernel.org/r/1609118774-10083-1-git-send-email-tiantao6@hisilicon.com Signed-off-by: Will Deacon --- drivers/iommu/intel/iommu.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/iommu/intel/iommu.c b/drivers/iommu/intel/iommu.c index 65cf06d70bf4..f665322a0991 100644 --- a/drivers/iommu/intel/iommu.c +++ b/drivers/iommu/intel/iommu.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include From ffaf97899c4a58b9fefb11534f730785443611a8 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 11 Jan 2021 22:52:18 +0000 Subject: [PATCH 064/241] drm/i915/gt: Limit VFE threads based on GT MEDIA_STATE_VFE only accepts the 'maximum number of threads' in the range [0, n-1] where n is #EU * (#threads/EU) with the number of threads based on plaform and the number of EU based on the number of slices and subslices. This is a fixed number per platform/gt, so appropriately limit the number of threads we spawn to match the device. v2: Oversaturate the system with tasks to force execution on every HW thread; if the thread idles it is returned to the pool and may be reused again before an unused thread. v3: Fix more state commands, which was causing Baytrail to barf. v4: STATE_CACHE_INVALIDATE requires a stall on Ivybridge Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2024 Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts") Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Prathap Kumar Valsan Cc: Akeem G Abodunrin Cc: Jon Bloomfield Cc: Rodrigo Vivi Cc: Randy Wright Cc: stable@vger.kernel.org # v5.7+ Reviewed-by: Akeem G Abodunrin Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20210111225220.3483-1-chris@chris-wilson.co.uk (cherry picked from commit eebfb32e26851662d24ea86dd381fd0f83cd4b47) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/gen7_renderclear.c | 157 ++++++++++++--------- 1 file changed, 94 insertions(+), 63 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/gen7_renderclear.c b/drivers/gpu/drm/i915/gt/gen7_renderclear.c index d93d85cd3027..94465374ca2f 100644 --- a/drivers/gpu/drm/i915/gt/gen7_renderclear.c +++ b/drivers/gpu/drm/i915/gt/gen7_renderclear.c @@ -7,8 +7,6 @@ #include "i915_drv.h" #include "intel_gpu_commands.h" -#define MAX_URB_ENTRIES 64 -#define STATE_SIZE (4 * 1024) #define GT3_INLINE_DATA_DELAYS 0x1E00 #define batch_advance(Y, CS) GEM_BUG_ON((Y)->end != (CS)) @@ -34,38 +32,59 @@ struct batch_chunk { }; struct batch_vals { - u32 max_primitives; - u32 max_urb_entries; - u32 cmd_size; - u32 state_size; + u32 max_threads; u32 state_start; - u32 batch_size; + u32 surface_start; u32 surface_height; u32 surface_width; - u32 scratch_size; - u32 max_size; + u32 size; }; +static inline int num_primitives(const struct batch_vals *bv) +{ + /* + * We need to saturate the GPU with work in order to dispatch + * a shader on every HW thread, and clear the thread-local registers. + * In short, we have to dispatch work faster than the shaders can + * run in order to fill the EU and occupy each HW thread. + */ + return bv->max_threads; +} + static void batch_get_defaults(struct drm_i915_private *i915, struct batch_vals *bv) { if (IS_HASWELL(i915)) { - bv->max_primitives = 280; - bv->max_urb_entries = MAX_URB_ENTRIES; + switch (INTEL_INFO(i915)->gt) { + default: + case 1: + bv->max_threads = 70; + break; + case 2: + bv->max_threads = 140; + break; + case 3: + bv->max_threads = 280; + break; + } bv->surface_height = 16 * 16; bv->surface_width = 32 * 2 * 16; } else { - bv->max_primitives = 128; - bv->max_urb_entries = MAX_URB_ENTRIES / 2; + switch (INTEL_INFO(i915)->gt) { + default: + case 1: /* including vlv */ + bv->max_threads = 36; + break; + case 2: + bv->max_threads = 128; + break; + } bv->surface_height = 16 * 8; bv->surface_width = 32 * 16; } - bv->cmd_size = bv->max_primitives * 4096; - bv->state_size = STATE_SIZE; - bv->state_start = bv->cmd_size; - bv->batch_size = bv->cmd_size + bv->state_size; - bv->scratch_size = bv->surface_height * bv->surface_width; - bv->max_size = bv->batch_size + bv->scratch_size; + bv->state_start = round_up(SZ_1K + num_primitives(bv) * 64, SZ_4K); + bv->surface_start = bv->state_start + SZ_4K; + bv->size = bv->surface_start + bv->surface_height * bv->surface_width; } static void batch_init(struct batch_chunk *bc, @@ -155,7 +174,8 @@ static u32 gen7_fill_binding_table(struct batch_chunk *state, const struct batch_vals *bv) { - u32 surface_start = gen7_fill_surface_state(state, bv->batch_size, bv); + u32 surface_start = + gen7_fill_surface_state(state, bv->surface_start, bv); u32 *cs = batch_alloc_items(state, 32, 8); u32 offset = batch_offset(state, cs); @@ -214,9 +234,9 @@ static void gen7_emit_state_base_address(struct batch_chunk *batch, u32 surface_state_base) { - u32 *cs = batch_alloc_items(batch, 0, 12); + u32 *cs = batch_alloc_items(batch, 0, 10); - *cs++ = STATE_BASE_ADDRESS | (12 - 2); + *cs++ = STATE_BASE_ADDRESS | (10 - 2); /* general */ *cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY; /* surface */ @@ -233,8 +253,6 @@ gen7_emit_state_base_address(struct batch_chunk *batch, *cs++ = BASE_ADDRESS_MODIFY; *cs++ = 0; *cs++ = BASE_ADDRESS_MODIFY; - *cs++ = 0; - *cs++ = 0; batch_advance(batch, cs); } @@ -244,8 +262,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch, u32 urb_size, u32 curbe_size, u32 mode) { - u32 urb_entries = bv->max_urb_entries; - u32 threads = bv->max_primitives - 1; + u32 threads = bv->max_threads - 1; u32 *cs = batch_alloc_items(batch, 32, 8); *cs++ = MEDIA_VFE_STATE | (8 - 2); @@ -254,7 +271,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch, *cs++ = 0; /* number of threads & urb entries for GPGPU vs Media Mode */ - *cs++ = threads << 16 | urb_entries << 8 | mode << 2; + *cs++ = threads << 16 | 1 << 8 | mode << 2; *cs++ = 0; @@ -293,17 +310,12 @@ gen7_emit_media_object(struct batch_chunk *batch, { unsigned int x_offset = (media_object_index % 16) * 64; unsigned int y_offset = (media_object_index / 16) * 16; - unsigned int inline_data_size; - unsigned int media_batch_size; - unsigned int i; + unsigned int pkt = 6 + 3; u32 *cs; - inline_data_size = 112 * 8; - media_batch_size = inline_data_size + 6; + cs = batch_alloc_items(batch, 8, pkt); - cs = batch_alloc_items(batch, 8, media_batch_size); - - *cs++ = MEDIA_OBJECT | (media_batch_size - 2); + *cs++ = MEDIA_OBJECT | (pkt - 2); /* interface descriptor offset */ *cs++ = 0; @@ -317,25 +329,44 @@ gen7_emit_media_object(struct batch_chunk *batch, *cs++ = 0; /* inline */ - *cs++ = (y_offset << 16) | (x_offset); + *cs++ = y_offset << 16 | x_offset; *cs++ = 0; *cs++ = GT3_INLINE_DATA_DELAYS; - for (i = 3; i < inline_data_size; i++) - *cs++ = 0; batch_advance(batch, cs); } static void gen7_emit_pipeline_flush(struct batch_chunk *batch) { - u32 *cs = batch_alloc_items(batch, 0, 5); + u32 *cs = batch_alloc_items(batch, 0, 4); - *cs++ = GFX_OP_PIPE_CONTROL(5); - *cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE | - PIPE_CONTROL_GLOBAL_GTT_IVB; + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH | + PIPE_CONTROL_DEPTH_CACHE_FLUSH | + PIPE_CONTROL_DC_FLUSH_ENABLE | + PIPE_CONTROL_CS_STALL; *cs++ = 0; *cs++ = 0; + + batch_advance(batch, cs); +} + +static void gen7_emit_pipeline_invalidate(struct batch_chunk *batch) +{ + u32 *cs = batch_alloc_items(batch, 0, 8); + + /* ivb: Stall before STATE_CACHE_INVALIDATE */ + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = PIPE_CONTROL_STALL_AT_SCOREBOARD | + PIPE_CONTROL_CS_STALL; *cs++ = 0; + *cs++ = 0; + + *cs++ = GFX_OP_PIPE_CONTROL(4); + *cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE; + *cs++ = 0; + *cs++ = 0; + batch_advance(batch, cs); } @@ -344,34 +375,34 @@ static void emit_batch(struct i915_vma * const vma, const struct batch_vals *bv) { struct drm_i915_private *i915 = vma->vm->i915; - unsigned int desc_count = 64; - const u32 urb_size = 112; + const unsigned int desc_count = 1; + const unsigned int urb_size = 1; struct batch_chunk cmds, state; - u32 interface_descriptor; + u32 descriptors; unsigned int i; - batch_init(&cmds, vma, start, 0, bv->cmd_size); - batch_init(&state, vma, start, bv->state_start, bv->state_size); + batch_init(&cmds, vma, start, 0, bv->state_start); + batch_init(&state, vma, start, bv->state_start, SZ_4K); - interface_descriptor = - gen7_fill_interface_descriptor(&state, bv, - IS_HASWELL(i915) ? - &cb_kernel_hsw : - &cb_kernel_ivb, - desc_count); - gen7_emit_pipeline_flush(&cmds); + descriptors = gen7_fill_interface_descriptor(&state, bv, + IS_HASWELL(i915) ? + &cb_kernel_hsw : + &cb_kernel_ivb, + desc_count); + + gen7_emit_pipeline_invalidate(&cmds); batch_add(&cmds, PIPELINE_SELECT | PIPELINE_SELECT_MEDIA); batch_add(&cmds, MI_NOOP); - gen7_emit_state_base_address(&cmds, interface_descriptor); + gen7_emit_pipeline_invalidate(&cmds); + gen7_emit_pipeline_flush(&cmds); + gen7_emit_state_base_address(&cmds, descriptors); + gen7_emit_pipeline_invalidate(&cmds); gen7_emit_vfe_state(&cmds, bv, urb_size - 1, 0, 0); + gen7_emit_interface_descriptor_load(&cmds, descriptors, desc_count); - gen7_emit_interface_descriptor_load(&cmds, - interface_descriptor, - desc_count); - - for (i = 0; i < bv->max_primitives; i++) + for (i = 0; i < num_primitives(bv); i++) gen7_emit_media_object(&cmds, i); batch_add(&cmds, MI_BATCH_BUFFER_END); @@ -385,15 +416,15 @@ int gen7_setup_clear_gpr_bb(struct intel_engine_cs * const engine, batch_get_defaults(engine->i915, &bv); if (!vma) - return bv.max_size; + return bv.size; - GEM_BUG_ON(vma->obj->base.size < bv.max_size); + GEM_BUG_ON(vma->obj->base.size < bv.size); batch = i915_gem_object_pin_map(vma->obj, I915_MAP_WC); if (IS_ERR(batch)) return PTR_ERR(batch); - emit_batch(vma, memset(batch, 0, bv.max_size), &bv); + emit_batch(vma, memset(batch, 0, bv.size), &bv); i915_gem_object_flush_map(vma->obj); __i915_gem_object_release_map(vma->obj); From 09aa9e45863e9e25dfbf350bae89fc3c2964482c Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 11 Jan 2021 22:52:19 +0000 Subject: [PATCH 065/241] drm/i915/gt: Restore clear-residual mitigations for Ivybridge, Baytrail The mitigation is required for all gen7 platforms, now that it does not cause GPU hangs, restore it for Ivybridge and Baytrail. Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts") Signed-off-by: Chris Wilson Cc: Mika Kuoppala Cc: Prathap Kumar Valsan Cc: Akeem G Abodunrin Cc: Bloomfield Jon Reviewed-by: Akeem G Abodunrin Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20210111225220.3483-2-chris@chris-wilson.co.uk (cherry picked from commit 008ead6ef8f588a8c832adfe9db201d9be5fd410) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/gt/intel_ring_submission.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c index a41b43f445b8..d809789f9cfb 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c @@ -1290,7 +1290,7 @@ int intel_ring_submission_setup(struct intel_engine_cs *engine) GEM_BUG_ON(timeline->hwsp_ggtt != engine->status_page.vma); - if (IS_HASWELL(engine->i915) && engine->class == RENDER_CLASS) { + if (IS_GEN(engine->i915, 7) && engine->class == RENDER_CLASS) { err = gen7_ctx_switch_bb_init(engine); if (err) goto err_ring_unpin; From 984cadea032b103c5824a5f29d0a36b3e9df6333 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Mon, 11 Jan 2021 22:52:20 +0000 Subject: [PATCH 066/241] drm/i915: Allow the sysadmin to override security mitigations The clear-residuals mitigation is a relatively heavy hammer and under some circumstances the user may wish to forgo the context isolation in order to meet some performance requirement. Introduce a generic module parameter to allow selectively enabling/disabling different mitigations. To disable just the clear-residuals mitigation (on Ivybridge, Baytrail, or Haswell) use the module parameter: i915.mitigations=auto,!residuals Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/1858 Fixes: 47f8253d2b89 ("drm/i915/gen7: Clear all EU/L3 residual contexts") Signed-off-by: Chris Wilson Cc: Joonas Lahtinen Cc: Jon Bloomfield Cc: Rodrigo Vivi Cc: stable@vger.kernel.org # v5.7 Reviewed-by: Jon Bloomfield Reviewed-by: Rodrigo Vivi Link: https://patchwork.freedesktop.org/patch/msgid/20210111225220.3483-3-chris@chris-wilson.co.uk (cherry picked from commit f7452c7cbd5b5dfb9a6c84cb20bea04c89be50cd) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/Makefile | 1 + .../gpu/drm/i915/gt/intel_ring_submission.c | 4 +- drivers/gpu/drm/i915/i915_mitigations.c | 146 ++++++++++++++++++ drivers/gpu/drm/i915/i915_mitigations.h | 13 ++ 4 files changed, 163 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/i915/i915_mitigations.c create mode 100644 drivers/gpu/drm/i915/i915_mitigations.h diff --git a/drivers/gpu/drm/i915/Makefile b/drivers/gpu/drm/i915/Makefile index e5574e506a5c..6d9e81ea67f4 100644 --- a/drivers/gpu/drm/i915/Makefile +++ b/drivers/gpu/drm/i915/Makefile @@ -38,6 +38,7 @@ i915-y += i915_drv.o \ i915_config.o \ i915_irq.o \ i915_getparam.o \ + i915_mitigations.o \ i915_params.o \ i915_pci.o \ i915_scatterlist.o \ diff --git a/drivers/gpu/drm/i915/gt/intel_ring_submission.c b/drivers/gpu/drm/i915/gt/intel_ring_submission.c index d809789f9cfb..ecf3a6118a6d 100644 --- a/drivers/gpu/drm/i915/gt/intel_ring_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_ring_submission.c @@ -32,6 +32,7 @@ #include "gen6_ppgtt.h" #include "gen7_renderclear.h" #include "i915_drv.h" +#include "i915_mitigations.h" #include "intel_breadcrumbs.h" #include "intel_context.h" #include "intel_gt.h" @@ -886,7 +887,8 @@ static int switch_context(struct i915_request *rq) GEM_BUG_ON(HAS_EXECLISTS(engine->i915)); if (engine->wa_ctx.vma && ce != engine->kernel_context) { - if (engine->wa_ctx.vma->private != ce) { + if (engine->wa_ctx.vma->private != ce && + i915_mitigate_clear_residuals()) { ret = clear_residuals(rq); if (ret) return ret; diff --git a/drivers/gpu/drm/i915/i915_mitigations.c b/drivers/gpu/drm/i915/i915_mitigations.c new file mode 100644 index 000000000000..84f12598d145 --- /dev/null +++ b/drivers/gpu/drm/i915/i915_mitigations.c @@ -0,0 +1,146 @@ +// SPDX-License-Identifier: MIT +/* + * Copyright © 2021 Intel Corporation + */ + +#include +#include +#include +#include + +#include "i915_drv.h" +#include "i915_mitigations.h" + +static unsigned long mitigations __read_mostly = ~0UL; + +enum { + CLEAR_RESIDUALS = 0, +}; + +static const char * const names[] = { + [CLEAR_RESIDUALS] = "residuals", +}; + +bool i915_mitigate_clear_residuals(void) +{ + return READ_ONCE(mitigations) & BIT(CLEAR_RESIDUALS); +} + +static int mitigations_set(const char *val, const struct kernel_param *kp) +{ + unsigned long new = ~0UL; + char *str, *sep, *tok; + bool first = true; + int err = 0; + + BUILD_BUG_ON(ARRAY_SIZE(names) >= BITS_PER_TYPE(mitigations)); + + str = kstrdup(val, GFP_KERNEL); + if (!str) + return -ENOMEM; + + for (sep = str; (tok = strsep(&sep, ","));) { + bool enable = true; + int i; + + /* Be tolerant of leading/trailing whitespace */ + tok = strim(tok); + + if (first) { + first = false; + + if (!strcmp(tok, "auto")) + continue; + + new = 0; + if (!strcmp(tok, "off")) + continue; + } + + if (*tok == '!') { + enable = !enable; + tok++; + } + + if (!strncmp(tok, "no", 2)) { + enable = !enable; + tok += 2; + } + + if (*tok == '\0') + continue; + + for (i = 0; i < ARRAY_SIZE(names); i++) { + if (!strcmp(tok, names[i])) { + if (enable) + new |= BIT(i); + else + new &= ~BIT(i); + break; + } + } + if (i == ARRAY_SIZE(names)) { + pr_err("Bad \"%s.mitigations=%s\", '%s' is unknown\n", + DRIVER_NAME, val, tok); + err = -EINVAL; + break; + } + } + kfree(str); + if (err) + return err; + + WRITE_ONCE(mitigations, new); + return 0; +} + +static int mitigations_get(char *buffer, const struct kernel_param *kp) +{ + unsigned long local = READ_ONCE(mitigations); + int count, i; + bool enable; + + if (!local) + return scnprintf(buffer, PAGE_SIZE, "%s\n", "off"); + + if (local & BIT(BITS_PER_LONG - 1)) { + count = scnprintf(buffer, PAGE_SIZE, "%s,", "auto"); + enable = false; + } else { + enable = true; + count = 0; + } + + for (i = 0; i < ARRAY_SIZE(names); i++) { + if ((local & BIT(i)) != enable) + continue; + + count += scnprintf(buffer + count, PAGE_SIZE - count, + "%s%s,", enable ? "" : "!", names[i]); + } + + buffer[count - 1] = '\n'; + return count; +} + +static const struct kernel_param_ops ops = { + .set = mitigations_set, + .get = mitigations_get, +}; + +module_param_cb_unsafe(mitigations, &ops, NULL, 0600); +MODULE_PARM_DESC(mitigations, +"Selectively enable security mitigations for all Intel® GPUs in the system.\n" +"\n" +" auto -- enables all mitigations required for the platform [default]\n" +" off -- disables all mitigations\n" +"\n" +"Individual mitigations can be enabled by passing a comma-separated string,\n" +"e.g. mitigations=residuals to enable only clearing residuals or\n" +"mitigations=auto,noresiduals to disable only the clear residual mitigation.\n" +"Either '!' or 'no' may be used to switch from enabling the mitigation to\n" +"disabling it.\n" +"\n" +"Active mitigations for Ivybridge, Baytrail, Haswell:\n" +" residuals -- clear all thread-local registers between contexts" +); diff --git a/drivers/gpu/drm/i915/i915_mitigations.h b/drivers/gpu/drm/i915/i915_mitigations.h new file mode 100644 index 000000000000..1359d8135287 --- /dev/null +++ b/drivers/gpu/drm/i915/i915_mitigations.h @@ -0,0 +1,13 @@ +/* SPDX-License-Identifier: MIT */ +/* + * Copyright © 2021 Intel Corporation + */ + +#ifndef __I915_MITIGATIONS_H__ +#define __I915_MITIGATIONS_H__ + +#include + +bool i915_mitigate_clear_residuals(void); + +#endif /* __I915_MITIGATIONS_H__ */ From d78050ee35440d7879ed94011c52994b8932e96e Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 7 Jan 2021 14:40:08 +0000 Subject: [PATCH 067/241] arm64: Remove arm64_dma32_phys_limit and its uses With the introduction of a dynamic ZONE_DMA range based on DT or IORT information, there's no need for CMA allocations from the wider ZONE_DMA32 since on most platforms ZONE_DMA will cover the 32-bit addressable range. Remove the arm64_dma32_phys_limit and set arm64_dma_phys_limit to cover the smallest DMA range required on the platform. CMA allocation and crashkernel reservation now go in the dynamically sized ZONE_DMA, allowing correct functionality on RPi4. Signed-off-by: Catalin Marinas Cc: Chen Zhou Reviewed-by: Nicolas Saenz Julienne Tested-by: Nicolas Saenz Julienne # On RPi4B --- arch/arm64/include/asm/processor.h | 3 +-- arch/arm64/mm/init.c | 33 ++++++++++++++++-------------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 69ad25fbeae4..ca2cd75d3286 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -94,8 +94,7 @@ #endif /* CONFIG_ARM64_FORCE_52BIT */ extern phys_addr_t arm64_dma_phys_limit; -extern phys_addr_t arm64_dma32_phys_limit; -#define ARCH_LOW_ADDRESS_LIMIT ((arm64_dma_phys_limit ? : arm64_dma32_phys_limit) - 1) +#define ARCH_LOW_ADDRESS_LIMIT (arm64_dma_phys_limit - 1) struct debug_info { #ifdef CONFIG_HAVE_HW_BREAKPOINT diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 7deddf56f7c3..709d98fea90c 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -53,13 +53,13 @@ s64 memstart_addr __ro_after_init = -1; EXPORT_SYMBOL(memstart_addr); /* - * We create both ZONE_DMA and ZONE_DMA32. ZONE_DMA covers the first 1G of - * memory as some devices, namely the Raspberry Pi 4, have peripherals with - * this limited view of the memory. ZONE_DMA32 will cover the rest of the 32 - * bit addressable memory area. + * If the corresponding config options are enabled, we create both ZONE_DMA + * and ZONE_DMA32. By default ZONE_DMA covers the 32-bit addressable memory + * unless restricted on specific platforms (e.g. 30-bit on Raspberry Pi 4). + * In such case, ZONE_DMA32 covers the rest of the 32-bit addressable memory, + * otherwise it is empty. */ phys_addr_t arm64_dma_phys_limit __ro_after_init; -phys_addr_t arm64_dma32_phys_limit __ro_after_init; #ifdef CONFIG_KEXEC_CORE /* @@ -84,7 +84,7 @@ static void __init reserve_crashkernel(void) if (crash_base == 0) { /* Current arm64 boot protocol requires 2MB alignment */ - crash_base = memblock_find_in_range(0, arm64_dma32_phys_limit, + crash_base = memblock_find_in_range(0, arm64_dma_phys_limit, crash_size, SZ_2M); if (crash_base == 0) { pr_warn("cannot allocate crashkernel (size:0x%llx)\n", @@ -196,6 +196,7 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; unsigned int __maybe_unused acpi_zone_dma_bits; unsigned int __maybe_unused dt_zone_dma_bits; + phys_addr_t __maybe_unused dma32_phys_limit = max_zone_phys(32); #ifdef CONFIG_ZONE_DMA acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address()); @@ -205,8 +206,12 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max) max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit); #endif #ifdef CONFIG_ZONE_DMA32 - max_zone_pfns[ZONE_DMA32] = PFN_DOWN(arm64_dma32_phys_limit); + max_zone_pfns[ZONE_DMA32] = PFN_DOWN(dma32_phys_limit); + if (!arm64_dma_phys_limit) + arm64_dma_phys_limit = dma32_phys_limit; #endif + if (!arm64_dma_phys_limit) + arm64_dma_phys_limit = PHYS_MASK + 1; max_zone_pfns[ZONE_NORMAL] = max; free_area_init(max_zone_pfns); @@ -394,16 +399,9 @@ void __init arm64_memblock_init(void) early_init_fdt_scan_reserved_mem(); - if (IS_ENABLED(CONFIG_ZONE_DMA32)) - arm64_dma32_phys_limit = max_zone_phys(32); - else - arm64_dma32_phys_limit = PHYS_MASK + 1; - reserve_elfcorehdr(); high_memory = __va(memblock_end_of_DRAM() - 1) + 1; - - dma_contiguous_reserve(arm64_dma32_phys_limit); } void __init bootmem_init(void) @@ -438,6 +436,11 @@ void __init bootmem_init(void) sparse_init(); zone_sizes_init(min, max); + /* + * Reserve the CMA area after arm64_dma_phys_limit was initialised. + */ + dma_contiguous_reserve(arm64_dma_phys_limit); + /* * request_standard_resources() depends on crashkernel's memory being * reserved, so do it here. @@ -455,7 +458,7 @@ void __init bootmem_init(void) void __init mem_init(void) { if (swiotlb_force == SWIOTLB_FORCE || - max_pfn > PFN_DOWN(arm64_dma_phys_limit ? : arm64_dma32_phys_limit)) + max_pfn > PFN_DOWN(arm64_dma_phys_limit)) swiotlb_init(1); else swiotlb_force = SWIOTLB_NO_FORCE; From 8e14f610159d524cd7aac37982826d3ef75c09e8 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Sat, 9 Jan 2021 15:17:06 +0000 Subject: [PATCH 068/241] dm crypt: do not call bio_endio() from the dm-crypt tasklet Sometimes, when dm-crypt executes decryption in a tasklet, we may get "BUG: KASAN: use-after-free in tasklet_action_common.constprop..." with a kasan-enabled kernel. When the decryption fully completes in the tasklet, dm-crypt will call bio_endio(), which in turn will call clone_endio() from dm.c core code. That function frees the resources associated with the bio, including per bio private structures. For dm-crypt it will free the current struct dm_crypt_io, which contains our tasklet object, causing use-after-free, when the tasklet is being dequeued by the kernel. To avoid this, do not call bio_endio() from the current tasklet context, but delay its execution to the dm-crypt IO workqueue. Fixes: 39d42fa96ba1 ("dm crypt: add flags to optionally bypass kcryptd workqueues") Cc: # v5.9+ Signed-off-by: Ignat Korchagin Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index d2d6d3000f5b..b4d7418b5036 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1730,6 +1730,12 @@ static void crypt_inc_pending(struct dm_crypt_io *io) atomic_inc(&io->io_pending); } +static void kcryptd_io_bio_endio(struct work_struct *work) +{ + struct dm_crypt_io *io = container_of(work, struct dm_crypt_io, work); + bio_endio(io->base_bio); +} + /* * One of the bios was finished. Check for completion of * the whole request and correctly clean up the buffer. @@ -1752,7 +1758,23 @@ static void crypt_dec_pending(struct dm_crypt_io *io) kfree(io->integrity_metadata); base_bio->bi_status = error; - bio_endio(base_bio); + + /* + * If we are running this function from our tasklet, + * we can't call bio_endio() here, because it will call + * clone_endio() from dm.c, which in turn will + * free the current struct dm_crypt_io structure with + * our tasklet. In this case we need to delay bio_endio() + * execution to after the tasklet is done and dequeued. + */ + if (tasklet_trylock(&io->tasklet)) { + tasklet_unlock(&io->tasklet); + bio_endio(base_bio); + return; + } + + INIT_WORK(&io->work, kcryptd_io_bio_endio); + queue_work(cc->io_queue, &io->work); } /* From 4be34f3d0731b38a1b24566b37fbb39500aaf3a2 Mon Sep 17 00:00:00 2001 From: Stanislav Fomichev Date: Tue, 12 Jan 2021 08:28:29 -0800 Subject: [PATCH 069/241] bpf: Don't leak memory in bpf getsockopt when optlen == 0 optlen == 0 indicates that the kernel should ignore BPF buffer and use the original one from the user. We, however, forget to free the temporary buffer that we've allocated for BPF. Fixes: d8fe449a9c51 ("bpf: Don't return EINVAL from {get,set}sockopt when optlen > PAGE_SIZE") Reported-by: Martin KaFai Lau Signed-off-by: Stanislav Fomichev Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210112162829.775079-1-sdf@google.com --- kernel/bpf/cgroup.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c index 6ec088a96302..96555a8a2c54 100644 --- a/kernel/bpf/cgroup.c +++ b/kernel/bpf/cgroup.c @@ -1391,12 +1391,13 @@ int __cgroup_bpf_run_filter_setsockopt(struct sock *sk, int *level, if (ctx.optlen != 0) { *optlen = ctx.optlen; *kernel_optval = ctx.optval; + /* export and don't free sockopt buf */ + return 0; } } out: - if (ret) - sockopt_free_buf(&ctx); + sockopt_free_buf(&ctx); return ret; } From bcc5e6162d66d44f7929f30fce032f95855fc8b4 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 9 Jan 2021 23:03:40 -0800 Subject: [PATCH 070/241] bpf: Allow empty module BTFs Some modules don't declare any new types and end up with an empty BTF, containing only valid BTF header and no types or strings sections. This currently causes BTF validation error. There is nothing wrong with such BTF, so fix the issue by allowing module BTFs with no types or strings. Fixes: 36e68442d1af ("bpf: Load and verify kernel module BTFs") Reported-by: Christopher William Snowhill Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210110070341.1380086-1-andrii@kernel.org --- kernel/bpf/btf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c index 8d6bdb4f4d61..84a36ee4a4c2 100644 --- a/kernel/bpf/btf.c +++ b/kernel/bpf/btf.c @@ -4172,7 +4172,7 @@ static int btf_parse_hdr(struct btf_verifier_env *env) return -ENOTSUPP; } - if (btf_data_size == hdr->hdr_len) { + if (!btf->base_btf && btf_data_size == hdr->hdr_len) { btf_verifier_log(env, "No data"); return -EINVAL; } From b8d52264df85ec12f370c0a8b28d0ac59a05877a Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Sat, 9 Jan 2021 23:03:41 -0800 Subject: [PATCH 071/241] libbpf: Allow loading empty BTFs Empty BTFs do come up (e.g., simple kernel modules with no new types and strings, compared to the vmlinux BTF) and there is nothing technically wrong with them. So remove unnecessary check preventing loading empty BTFs. Fixes: d8123624506c ("libbpf: Fix BTF data layout checks and allow empty BTF") Reported-by: Christopher William Snowhill Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210110070341.1380086-2-andrii@kernel.org --- tools/lib/bpf/btf.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tools/lib/bpf/btf.c b/tools/lib/bpf/btf.c index 3c3f2bc6c652..9970a288dda5 100644 --- a/tools/lib/bpf/btf.c +++ b/tools/lib/bpf/btf.c @@ -240,11 +240,6 @@ static int btf_parse_hdr(struct btf *btf) } meta_left = btf->raw_size - sizeof(*hdr); - if (!meta_left) { - pr_debug("BTF has no data\n"); - return -EINVAL; - } - if (meta_left < hdr->str_off + hdr->str_len) { pr_debug("Invalid BTF total size:%u\n", btf->raw_size); return -EINVAL; From 17ffc193cdc6dc7a613d00d8ad47fc1f801b9bf0 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 12 Jan 2021 14:54:47 -0500 Subject: [PATCH 072/241] dm integrity: fix the maximum number of arguments Advance the maximum number of arguments from 9 to 15 to account for all potential feature flags that may be supplied. Linux 4.19 added "meta_device" (356d9d52e1221ba0c9f10b8b38652f78a5298329) and "recalculate" (a3fcf7253139609bf9ff901fbf955fba047e75dd) flags. Commit 468dfca38b1a6fbdccd195d875599cb7c8875cd9 added "sectors_per_bit" and "bitmap_flush_interval". Commit 84597a44a9d86ac949900441cea7da0af0f2f473 added "allow_discards". And the commit d537858ac8aaf4311b51240893add2fc62003b97 added "fix_padding". Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org # v4.19+ Signed-off-by: Mike Snitzer --- drivers/md/dm-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index 11c7c538f7a9..81df019ab284 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -3792,7 +3792,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned argc, char **argv) unsigned extra_args; struct dm_arg_set as; static const struct dm_arg _args[] = { - {0, 9, "Invalid number of feature args"}, + {0, 15, "Invalid number of feature args"}, }; unsigned journal_sectors, interleave_sectors, buffer_sectors, journal_watermark, sync_msec; bool should_write_sb; From 8ff60eb052eeba95cfb3efe16b08c9199f8121cf Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 12 Jan 2021 15:49:04 -0800 Subject: [PATCH 073/241] mm, slub: consider rest of partial list if acquire_slab() fails acquire_slab() fails if there is contention on the freelist of the page (probably because some other CPU is concurrently freeing an object from the page). In that case, it might make sense to look for a different page (since there might be more remote frees to the page from other CPUs, and we don't want contention on struct page). However, the current code accidentally stops looking at the partial list completely in that case. Especially on kernels without CONFIG_NUMA set, this means that get_partial() fails and new_slab_objects() falls back to new_slab(), allocating new pages. This could lead to an unnecessary increase in memory fragmentation. Link: https://lkml.kernel.org/r/20201228130853.1871516-1-jannh@google.com Fixes: 7ced37197196 ("slub: Acquire_slab() avoid loop") Signed-off-by: Jann Horn Acked-by: David Rientjes Acked-by: Joonsoo Kim Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index dc5b42e700b8..d9e4e10683cc 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1973,7 +1973,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, t = acquire_slab(s, n, page, object == NULL, &objects); if (!t) - break; + continue; /* cmpxchg raced */ available += objects; if (!object) { From ce8f86ee94fabcc98537ddccd7e82cfd360a4dc5 Mon Sep 17 00:00:00 2001 From: Hailong liu Date: Tue, 12 Jan 2021 15:49:08 -0800 Subject: [PATCH 074/241] mm/page_alloc: add a missing mm_page_alloc_zone_locked() tracepoint The trace point *trace_mm_page_alloc_zone_locked()* in __rmqueue() does not currently cover all branches. Add the missing tracepoint and check the page before do that. [akpm@linux-foundation.org: use IS_ENABLED() to suppress warning] Link: https://lkml.kernel.org/r/20201228132901.41523-1-carver4lio@163.com Signed-off-by: Hailong liu Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bdbec4c98173..027f6481ba59 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2862,20 +2862,20 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype, { struct page *page; -#ifdef CONFIG_CMA - /* - * Balance movable allocations between regular and CMA areas by - * allocating from CMA when over half of the zone's free memory - * is in the CMA area. - */ - if (alloc_flags & ALLOC_CMA && - zone_page_state(zone, NR_FREE_CMA_PAGES) > - zone_page_state(zone, NR_FREE_PAGES) / 2) { - page = __rmqueue_cma_fallback(zone, order); - if (page) - return page; + if (IS_ENABLED(CONFIG_CMA)) { + /* + * Balance movable allocations between regular and CMA areas by + * allocating from CMA when over half of the zone's free memory + * is in the CMA area. + */ + if (alloc_flags & ALLOC_CMA && + zone_page_state(zone, NR_FREE_CMA_PAGES) > + zone_page_state(zone, NR_FREE_PAGES) / 2) { + page = __rmqueue_cma_fallback(zone, order); + if (page) + goto out; + } } -#endif retry: page = __rmqueue_smallest(zone, order, migratetype); if (unlikely(!page)) { @@ -2886,8 +2886,9 @@ retry: alloc_flags)) goto retry; } - - trace_mm_page_alloc_zone_locked(page, order, migratetype); +out: + if (page) + trace_mm_page_alloc_zone_locked(page, order, migratetype); return page; } From 7ea510b92c7c9b4eb5ff72e6b4bbad4b0407a914 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 12 Jan 2021 15:49:11 -0800 Subject: [PATCH 075/241] mm/memcontrol: fix warning in mem_cgroup_page_lruvec() Boot a CONFIG_MEMCG=y kernel with "cgroup_disabled=memory" and you are met by a series of warnings from the VM_WARN_ON_ONCE_PAGE(!memcg, page) recently added to the inline mem_cgroup_page_lruvec(). An earlier attempt to place that warning, in mem_cgroup_lruvec(), had been careful to do so after weeding out the mem_cgroup_disabled() case; but was itself invalid because of the mem_cgroup_lruvec(NULL, pgdat) in clear_pgdat_congested() and age_active_anon(). Warning in mem_cgroup_page_lruvec() was once useful in detecting a KSM charge bug, so may be worth keeping: but skip if mem_cgroup_disabled(). Link: https://lkml.kernel.org/r/alpine.LSU.2.11.2101032056260.1093@eggly.anvils Fixes: 9a1ac2288cf1 ("mm/memcontrol:rewrite mem_cgroup_page_lruvec()") Signed-off-by: Hugh Dickins Reviewed-by: Alex Shi Acked-by: Roman Gushchin Acked-by: Chris Down Reviewed-by: Baoquan He Acked-by: Vlastimil Babka Cc: Hui Su Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Johannes Weiner Cc: Shakeel Butt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d827bd7f3bfe..eeb0b52203e9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -665,7 +665,7 @@ static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, { struct mem_cgroup *memcg = page_memcg(page); - VM_WARN_ON_ONCE_PAGE(!memcg, page); + VM_WARN_ON_ONCE_PAGE(!memcg && !mem_cgroup_disabled(), page); return mem_cgroup_lruvec(memcg, pgdat); } From 29970dc24faf0078beb4efab5455b4f504d2198d Mon Sep 17 00:00:00 2001 From: Hailong Liu Date: Tue, 12 Jan 2021 15:49:14 -0800 Subject: [PATCH 076/241] arm/kasan: fix the array size of kasan_early_shadow_pte[] MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The size of kasan_early_shadow_pte[] now is PTRS_PER_PTE which defined to 512 for arm. This means that it only covers the prev Linux pte entries, but not the HWTABLE pte entries for arm. The reason it currently works is that the symbol kasan_early_shadow_page immediately following kasan_early_shadow_pte in memory is page aligned, which makes kasan_early_shadow_pte look like a 4KB size array. But we can't ensure the order is always right with different compiler/linker, or if more bss symbols are introduced. We had a test with QEMU + vexpress:put a 512KB-size symbol with attribute __section(".bss..page_aligned") after kasan_early_shadow_pte, and poisoned it after kasan_early_init(). Then enabled CONFIG_KASAN, it failed to boot up. Link: https://lkml.kernel.org/r/20210109044622.8312-1-hailongliiu@yeah.net Signed-off-by: Hailong Liu Signed-off-by: Ziliang Guo Reviewed-by: Linus Walleij Cc: Andrey Ryabinin Cc: Russell King Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Ard Biesheuvel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kasan.h | 6 +++++- mm/kasan/init.c | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 5e0655fb2a6f..fe1ae73ff8b5 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -35,8 +35,12 @@ struct kunit_kasan_expectation { #define KASAN_SHADOW_INIT 0 #endif +#ifndef PTE_HWTABLE_PTRS +#define PTE_HWTABLE_PTRS 0 +#endif + extern unsigned char kasan_early_shadow_page[PAGE_SIZE]; -extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE]; +extern pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS]; extern pmd_t kasan_early_shadow_pmd[PTRS_PER_PMD]; extern pud_t kasan_early_shadow_pud[PTRS_PER_PUD]; extern p4d_t kasan_early_shadow_p4d[MAX_PTRS_PER_P4D]; diff --git a/mm/kasan/init.c b/mm/kasan/init.c index bc0ad208b3a7..7ca0b92d5886 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -64,7 +64,8 @@ static inline bool kasan_pmd_table(pud_t pud) return false; } #endif -pte_t kasan_early_shadow_pte[PTRS_PER_PTE] __page_aligned_bss; +pte_t kasan_early_shadow_pte[PTRS_PER_PTE + PTE_HWTABLE_PTRS] + __page_aligned_bss; static inline bool kasan_pte_table(pmd_t pmd) { From c22ee5284cf58017fa8c6d21d8f8c68159b6faab Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 12 Jan 2021 15:49:18 -0800 Subject: [PATCH 077/241] mm/vmalloc.c: fix potential memory leak In VM_MAP_PUT_PAGES case, we should put pages and free array in vfree. But we missed to set area->nr_pages in vmap(). So we would fail to put pages in __vunmap() because area->nr_pages = 0. Link: https://lkml.kernel.org/r/20210107123541.39206-1-linmiaohe@huawei.com Fixes: b944afc9d64d ("mm: add a VM_MAP_PUT_PAGES flag for vmap") Signed-off-by: Shijie Luo Signed-off-by: Miaohe Lin Reviewed-by: Uladzislau Rezki (Sony) Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4d88fe5a277a..e6f352bf0498 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2420,8 +2420,10 @@ void *vmap(struct page **pages, unsigned int count, return NULL; } - if (flags & VM_MAP_PUT_PAGES) + if (flags & VM_MAP_PUT_PAGES) { area->pages = pages; + area->nr_pages = count; + } return area->addr; } EXPORT_SYMBOL(vmap); From f555befd185dc097ede887eb7b308c2e1c1369d4 Mon Sep 17 00:00:00 2001 From: Jan Stancek Date: Tue, 12 Jan 2021 15:49:21 -0800 Subject: [PATCH 078/241] mm: migrate: initialize err in do_migrate_pages After commit 236c32eb1096 ("mm: migrate: clean up migrate_prep{_local}")', do_migrate_pages can return uninitialized variable 'err' (which is propagated to user-space as error) when 'from' and 'to' nodesets are identical. This can be reproduced with LTP migrate_pages01, which calls migrate_pages() with same set for both old/new_nodes. Add 'err' initialization back. Link: https://lkml.kernel.org/r/456a021c7ef3636d7668cec9dcb4a446a4244812.1609855564.git.jstancek@redhat.com Fixes: 236c32eb1096 ("mm: migrate: clean up migrate_prep{_local}") Signed-off-by: Jan Stancek Acked-by: Michal Hocko Acked-by: Yang Shi Cc: Zi Yan Cc: Jan Kara Cc: Matthew Wilcox Cc: Mel Gorman Cc: Song Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8cf96bd21341..2c3a86502053 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1111,7 +1111,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { int busy = 0; - int err; + int err = 0; nodemask_t tmp; migrate_prep(); From 0eb98f1588c2cc7a79816d84ab18a55d254f481c Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 12 Jan 2021 15:49:24 -0800 Subject: [PATCH 079/241] mm/hugetlb: fix potential missing huge page size info The huge page size is encoded for VM_FAULT_HWPOISON errors only. So if we return VM_FAULT_HWPOISON, huge page size would just be ignored. Link: https://lkml.kernel.org/r/20210107123449.38481-1-linmiaohe@huawei.com Fixes: aa50d3a7aa81 ("Encode huge page size for VM_FAULT_HWPOISON errors") Signed-off-by: Miaohe Lin Reviewed-by: Mike Kravetz Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a2602969873d..18f6ee317900 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4371,7 +4371,7 @@ retry: * So we need to block hugepage fault by PG_hwpoison bit check. */ if (unlikely(PageHWPoison(page))) { - ret = VM_FAULT_HWPOISON | + ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; } From 7e5f1126b54a29c078c07a5fe245e269f3c05500 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 12 Jan 2021 15:49:27 -0800 Subject: [PATCH 080/241] MAINTAINERS: add Vlastimil as slab allocators maintainer I would like to help with slab allocators maintenance, from the perspective of being responsible for SLAB and more recently also SLUB in an enterprise distro kernel and supporting its users. Recently I've been focusing on improving SLUB's debugging features, and patch review in the area, including the kmemcg accounting rewrite last year. Link: https://lkml.kernel.org/r/20210108110353.19971-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Christoph Lameter Acked-by: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index cc1e6a5ee6e6..28cbe0ec6610 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16319,6 +16319,7 @@ M: Pekka Enberg M: David Rientjes M: Joonsoo Kim M: Andrew Morton +M: Vlastimil Babka L: linux-mm@kvack.org S: Maintained F: include/linux/sl?b*.h From 6696d2a6f38c0beedf03c381edfc392ecf7631b4 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 12 Jan 2021 15:49:30 -0800 Subject: [PATCH 081/241] mm,hwpoison: fix printing of page flags Format %pG expects a lower case 'p' in order to print the flags. Fix it. Link: https://lkml.kernel.org/r/20210108085202.4506-1-osalvador@suse.de Fixes: 8295d535e2aa ("mm,hwpoison: refactor get_any_page") Signed-off-by: Oscar Salvador Reported-by: Dan Carpenter Reviewed-by: Anshuman Khandual Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5a38e9eade94..04d9f154a130 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1940,7 +1940,7 @@ retry: goto retry; } } else if (ret == -EIO) { - pr_info("%s: %#lx: unknown page type: %lx (%pGP)\n", + pr_info("%s: %#lx: unknown page type: %lx (%pGp)\n", __func__, pfn, page->flags, &page->flags); } From eb351d75ce1e75b4f793d609efac08426ca50acd Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 12 Jan 2021 15:49:33 -0800 Subject: [PATCH 082/241] mm/process_vm_access.c: include compat.h Fix the build error: mm/process_vm_access.c:277:5: error: implicit declaration of function 'in_compat_syscall'; did you mean 'in_ia32_syscall'? [-Werror=implicit-function-declaration] Fixes: 38dc5079da7081e "Fix compat regression in process_vm_rw()" Reported-by: syzbot+5b0d0de84d6c65b8dd2b@syzkaller.appspotmail.com Cc: Kyle Huey Cc: Jens Axboe Cc: Al Viro Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/process_vm_access.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index 4bcc11958089..f5fee9cf90f8 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include From 7cd1af107a92eb63b93a96dc07406dcbc5269436 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Fri, 18 Dec 2020 16:20:51 -0800 Subject: [PATCH 083/241] riscv: Trace irq on only interrupt is enabled We should call irq trace only if interrupt is going to be enabled during excecption handling. Otherwise, it results in following warning during boot with lock debugging enabled. [ 0.000000] ------------[ cut here ]------------ [ 0.000000] DEBUG_LOCKS_WARN_ON(early_boot_irqs_disabled) [ 0.000000] WARNING: CPU: 0 PID: 0 at kernel/locking/lockdep.c:4085 lockdep_hardirqs_on_prepare+0x22a/0x22e [ 0.000000] Modules linked in: [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 5.10.0-00022-ge20097fb37e2-dirty #548 [ 0.000000] epc: c005d5d4 ra : c005d5d4 sp : c1c01e80 [ 0.000000] gp : c1d456e0 tp : c1c0a980 t0 : 00000000 [ 0.000000] t1 : ffffffff t2 : 00000000 s0 : c1c01ea0 [ 0.000000] s1 : c100f360 a0 : 0000002d a1 : c00666ee [ 0.000000] a2 : 00000000 a3 : 00000000 a4 : 00000000 [ 0.000000] a5 : 00000000 a6 : c1c6b390 a7 : 3ffff00e [ 0.000000] s2 : c2384fe8 s3 : 00000000 s4 : 00000001 [ 0.000000] s5 : c1c0a980 s6 : c1d48000 s7 : c1613b4c [ 0.000000] s8 : 00000fff s9 : 80000200 s10: c1613b40 [ 0.000000] s11: 00000000 t3 : 00000000 t4 : 00000000 [ 0.000000] t5 : 00000001 t6 : 00000000 Fixes: 3c4697982982 ("riscv:Enable LOCKDEP_SUPPORT & fixup TRACE_IRQFLAGS_SUPPORT") Signed-off-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/entry.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/entry.S b/arch/riscv/kernel/entry.S index 48de316c68c1..744f3209c48d 100644 --- a/arch/riscv/kernel/entry.S +++ b/arch/riscv/kernel/entry.S @@ -124,15 +124,15 @@ skip_context_tracking: REG_L a1, (a1) jr a1 1: -#ifdef CONFIG_TRACE_IRQFLAGS - call trace_hardirqs_on -#endif /* * Exceptions run with interrupts enabled or disabled depending on the * state of SR_PIE in m/sstatus. */ andi t0, s1, SR_PIE beqz t0, 1f +#ifdef CONFIG_TRACE_IRQFLAGS + call trace_hardirqs_on +#endif csrs CSR_STATUS, SR_IE 1: From 80709af7325d179b433817f421c85449f2454046 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 23 Dec 2020 00:01:52 +0800 Subject: [PATCH 084/241] riscv: cacheinfo: Fix using smp_processor_id() in preemptible Use raw_smp_processor_id instead of smp_processor_id() to fix warning, BUG: using smp_processor_id() in preemptible [00000000] code: init/1 caller is debug_smp_processor_id+0x1c/0x26 CPU: 0 PID: 1 Comm: init Not tainted 5.10.0-rc4 #211 Call Trace: walk_stackframe+0x0/0xaa show_stack+0x32/0x3e dump_stack+0x76/0x90 check_preemption_disabled+0xaa/0xac debug_smp_processor_id+0x1c/0x26 get_cache_size+0x18/0x68 load_elf_binary+0x868/0xece bprm_execve+0x224/0x498 kernel_execve+0xdc/0x142 run_init_process+0x90/0x9e try_to_run_init_process+0x12/0x3c kernel_init+0xb4/0xf8 ret_from_exception+0x0/0xc The issue is found when CONFIG_DEBUG_PREEMPT enabled. Reviewed-by: Atish Patra Tested-by: Atish Patra Signed-off-by: Kefeng Wang [Palmer: Added a comment.] Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cacheinfo.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/cacheinfo.c b/arch/riscv/kernel/cacheinfo.c index de59dd457b41..d86781357044 100644 --- a/arch/riscv/kernel/cacheinfo.c +++ b/arch/riscv/kernel/cacheinfo.c @@ -26,7 +26,16 @@ cache_get_priv_group(struct cacheinfo *this_leaf) static struct cacheinfo *get_cacheinfo(u32 level, enum cache_type type) { - struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(smp_processor_id()); + /* + * Using raw_smp_processor_id() elides a preemptability check, but this + * is really indicative of a larger problem: the cacheinfo UABI assumes + * that cores have a homonogenous view of the cache hierarchy. That + * happens to be the case for the current set of RISC-V systems, but + * likely won't be true in general. Since there's no way to provide + * correct information for these systems via the current UABI we're + * just eliding the check for now. + */ + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(raw_smp_processor_id()); struct cacheinfo *this_leaf; int index; From 0aa2ec8a475fb505fd98d93bbcf4e03beeeebcb6 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Sat, 2 Jan 2021 13:24:34 +0000 Subject: [PATCH 085/241] riscv: Fixup CONFIG_GENERIC_TIME_VSYSCALL The patch fix commit: ad5d112 ("riscv: use vDSO common flow to reduce the latency of the time-related functions"). The GENERIC_TIME_VSYSCALL should be CONFIG_GENERIC_TIME_VSYSCALL or vgettimeofday won't work. Signed-off-by: Guo Ren Reviewed-by: Pekka Enberg Fixes: ad5d1122b82f ("riscv: use vDSO common flow to reduce the latency of the time-related functions") Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/vdso.h | 2 +- arch/riscv/kernel/vdso.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/vdso.h b/arch/riscv/include/asm/vdso.h index 8454f746bbfd..1453a2f563bc 100644 --- a/arch/riscv/include/asm/vdso.h +++ b/arch/riscv/include/asm/vdso.h @@ -10,7 +10,7 @@ #include -#ifndef GENERIC_TIME_VSYSCALL +#ifndef CONFIG_GENERIC_TIME_VSYSCALL struct vdso_data { }; #endif diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index 678204231700..3f1d35e7c98a 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -12,7 +12,7 @@ #include #include #include -#ifdef GENERIC_TIME_VSYSCALL +#ifdef CONFIG_GENERIC_TIME_VSYSCALL #include #else #include From 69e976831cd53f9ba304fd20305b2025ecc78eab Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sun, 10 Jan 2021 14:21:05 +0000 Subject: [PATCH 086/241] MIPS: relocatable: fix possible boot hangup with KASLR enabled LLVM-built Linux triggered a boot hangup with KASLR enabled. arch/mips/kernel/relocate.c:get_random_boot() uses linux_banner, which is a string constant, as a random seed, but accesses it as an array of unsigned long (in rotate_xor()). When the address of linux_banner is not aligned to sizeof(long), such access emits unaligned access exception and hangs the kernel. Use PTR_ALIGN() to align input address to sizeof(long) and also align down the input length to prevent possible access-beyond-end. Fixes: 405bc8fd12f5 ("MIPS: Kernel: Implement KASLR using CONFIG_RELOCATABLE") Cc: stable@vger.kernel.org # 4.7+ Signed-off-by: Alexander Lobakin Tested-by: Nathan Chancellor Reviewed-by: Kees Cook Signed-off-by: Thomas Bogendoerfer --- arch/mips/kernel/relocate.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/mips/kernel/relocate.c b/arch/mips/kernel/relocate.c index 47aeb3350a76..0e365b7c742d 100644 --- a/arch/mips/kernel/relocate.c +++ b/arch/mips/kernel/relocate.c @@ -187,8 +187,14 @@ static int __init relocate_exception_table(long offset) static inline __init unsigned long rotate_xor(unsigned long hash, const void *area, size_t size) { - size_t i; - unsigned long *ptr = (unsigned long *)area; + const typeof(hash) *ptr = PTR_ALIGN(area, sizeof(hash)); + size_t diff, i; + + diff = (void *)ptr - area; + if (unlikely(size < diff + sizeof(hash))) + return hash; + + size = ALIGN_DOWN(size - diff, sizeof(hash)); for (i = 0; i < size / sizeof(hash); i++) { /* Rotate by odd number of bits and XOR. */ From 7b490a8ab0f2d3ab8d838a4ff22ae86edafd34a1 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Mon, 11 Jan 2021 05:27:25 -0800 Subject: [PATCH 087/241] MIPS: OCTEON: fix unreachable code in octeon_irq_init_ciu The type of 'r' in octeon_irq_init_ciu is 'unsigned int', so 'r < 0' can't be true. Fix this by change the type of 'r' and 'i' from 'unsigned int' to 'int'. As 'i' won't be negative, this change works. Fixes: 99fbc70f8547 ("MIPS: Octeon: irq: Alloc desc before configuring IRQ") Signed-off-by: Menglong Dong Reviewed-by: Alexander Sverdlin Signed-off-by: Thomas Bogendoerfer --- arch/mips/cavium-octeon/octeon-irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/cavium-octeon/octeon-irq.c b/arch/mips/cavium-octeon/octeon-irq.c index bd47e15d02c7..be5d4afcd30f 100644 --- a/arch/mips/cavium-octeon/octeon-irq.c +++ b/arch/mips/cavium-octeon/octeon-irq.c @@ -1444,7 +1444,7 @@ static void octeon_irq_setup_secondary_ciu2(void) static int __init octeon_irq_init_ciu( struct device_node *ciu_node, struct device_node *parent) { - unsigned int i, r; + int i, r; struct irq_chip *chip; struct irq_chip *chip_edge; struct irq_chip *chip_mbox; From ef3a575baf53571dc405ee4028e26f50856898e7 Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Tue, 12 Jan 2021 12:53:58 +0100 Subject: [PATCH 088/241] xen/privcmd: allow fetching resource sizes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Allow issuing an IOCTL_PRIVCMD_MMAP_RESOURCE ioctl with num = 0 and addr = 0 in order to fetch the size of a specific resource. Add a shortcut to the default map resource path, since fetching the size requires no address to be passed in, and thus no VMA to setup. This is missing from the initial implementation, and causes issues when mapping resources that don't have fixed or known sizes. Signed-off-by: Roger Pau Monné Reviewed-by: Juergen Gross Tested-by: Andrew Cooper Cc: stable@vger.kernel.org # >= 4.18 Link: https://lore.kernel.org/r/20210112115358.23346-1-roger.pau@citrix.com Signed-off-by: Juergen Gross --- drivers/xen/privcmd.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index b0c73c58f987..720a7b7abd46 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -717,14 +717,15 @@ static long privcmd_ioctl_restrict(struct file *file, void __user *udata) return 0; } -static long privcmd_ioctl_mmap_resource(struct file *file, void __user *udata) +static long privcmd_ioctl_mmap_resource(struct file *file, + struct privcmd_mmap_resource __user *udata) { struct privcmd_data *data = file->private_data; struct mm_struct *mm = current->mm; struct vm_area_struct *vma; struct privcmd_mmap_resource kdata; xen_pfn_t *pfns = NULL; - struct xen_mem_acquire_resource xdata; + struct xen_mem_acquire_resource xdata = { }; int rc; if (copy_from_user(&kdata, udata, sizeof(kdata))) @@ -734,6 +735,22 @@ static long privcmd_ioctl_mmap_resource(struct file *file, void __user *udata) if (data->domid != DOMID_INVALID && data->domid != kdata.dom) return -EPERM; + /* Both fields must be set or unset */ + if (!!kdata.addr != !!kdata.num) + return -EINVAL; + + xdata.domid = kdata.dom; + xdata.type = kdata.type; + xdata.id = kdata.id; + + if (!kdata.addr && !kdata.num) { + /* Query the size of the resource. */ + rc = HYPERVISOR_memory_op(XENMEM_acquire_resource, &xdata); + if (rc) + return rc; + return __put_user(xdata.nr_frames, &udata->num); + } + mmap_write_lock(mm); vma = find_vma(mm, kdata.addr); @@ -768,10 +785,6 @@ static long privcmd_ioctl_mmap_resource(struct file *file, void __user *udata) } else vma->vm_private_data = PRIV_VMA_LOCKED; - memset(&xdata, 0, sizeof(xdata)); - xdata.domid = kdata.dom; - xdata.type = kdata.type; - xdata.id = kdata.id; xdata.frame = kdata.idx; xdata.nr_frames = kdata.num; set_xen_guest_handle(xdata.frame_list, pfns); From df06824767cc9a32fbdb0e3d3b7e169292a5b5fe Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 7 Jan 2021 14:53:10 +0000 Subject: [PATCH 089/241] arm64: entry: remove redundant IRQ flag tracing All EL0 returns go via ret_to_user(), which masks IRQs and notifies lockdep and tracing before calling into do_notify_resume(). Therefore, there's no need for do_notify_resume() to call trace_hardirqs_off(), and the comment is stale. The call is simply redundant. In ret_to_user() we call exit_to_user_mode(), which notifies lockdep and tracing the IRQs will be enabled in userspace, so there's no need for el0_svc_common() to call trace_hardirqs_on() before returning. Further, at the start of ret_to_user() we call trace_hardirqs_off(), so not only is this redundant, but it is immediately undone. In addition to being redundant, the trace_hardirqs_on() in el0_svc_common() leaves lockdep inconsistent with the hardware state, and is liable to cause issues for any C code or instrumentation between this and the call to trace_hardirqs_off() which undoes it in ret_to_user(). This patch removes the redundant tracing calls and associated stale comments. Fixes: 23529049c684 ("arm64: entry: fix non-NMI user<->kernel transitions") Signed-off-by: Mark Rutland Acked-by: Will Deacon Cc: James Morse Cc: Will Deacon Link: https://lore.kernel.org/r/20210107145310.44616-1-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/signal.c | 7 ------- arch/arm64/kernel/syscall.c | 9 +-------- 2 files changed, 1 insertion(+), 15 deletions(-) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index f71d6ce4673f..6237486ff6bb 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -914,13 +914,6 @@ static void do_signal(struct pt_regs *regs) asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) { - /* - * The assembly code enters us with IRQs off, but it hasn't - * informed the tracing code of that for efficiency reasons. - * Update the trace code with the current status. - */ - trace_hardirqs_off(); - do { if (thread_flags & _TIF_NEED_RESCHED) { /* Unmask Debug and SError for the next task */ diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index f61e9d8cc55a..0bfac95fe464 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -165,15 +165,8 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, if (!has_syscall_work(flags) && !IS_ENABLED(CONFIG_DEBUG_RSEQ)) { local_daif_mask(); flags = current_thread_info()->flags; - if (!has_syscall_work(flags) && !(flags & _TIF_SINGLESTEP)) { - /* - * We're off to userspace, where interrupts are - * always enabled after we restore the flags from - * the SPSR. - */ - trace_hardirqs_on(); + if (!has_syscall_work(flags) && !(flags & _TIF_SINGLESTEP)) return; - } local_daif_restore(DAIF_PROCCTX); } From 5f39d2713bd80e8a3e6d9299930aec8844872c0e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sun, 3 Jan 2021 14:39:27 -0500 Subject: [PATCH 090/241] SUNRPC: Move the svc_xdr_recvfrom tracepoint again Commit 156708adf2d9 ("SUNRPC: Move the svc_xdr_recvfrom() tracepoint") tried to capture the correct XID in the trace record, but this line in svc_recv: rqstp->rq_xid = svc_getu32(&rqstp->rq_arg.head[0]); alters the size of rq_arg.head[0].iov_len. The tracepoint records the correct XID but an incorrect value for the length of the xdr_buf's head. To keep the trace callsites simple, I've created two trace classes. One assumes the xdr_buf contains a full RPC message, and the XID can be extracted from it. The other assumes the contents of the xdr_buf are arbitrary, and the xid will be provided by the caller. Currently there is only one user of each class, but I expect we will need a few more tracepoints using each class as time goes on. Signed-off-by: Chuck Lever --- include/trace/events/sunrpc.h | 61 +++++++++++++++++++++++++++++++---- net/sunrpc/svc_xprt.c | 4 +-- 2 files changed, 56 insertions(+), 9 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 58994e013022..6f89c27265f5 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -1424,13 +1424,13 @@ TRACE_EVENT(rpcb_unregister, ) ); -DECLARE_EVENT_CLASS(svc_xdr_buf_class, +/* Record an xdr_buf containing a fully-formed RPC message */ +DECLARE_EVENT_CLASS(svc_xdr_msg_class, TP_PROTO( - const struct svc_rqst *rqst, const struct xdr_buf *xdr ), - TP_ARGS(rqst, xdr), + TP_ARGS(xdr), TP_STRUCT__entry( __field(u32, xid) @@ -1443,7 +1443,55 @@ DECLARE_EVENT_CLASS(svc_xdr_buf_class, ), TP_fast_assign( - __entry->xid = be32_to_cpu(rqst->rq_xid); + __be32 *p = (__be32 *)xdr->head[0].iov_base; + + __entry->xid = be32_to_cpu(*p); + __entry->head_base = p; + __entry->head_len = xdr->head[0].iov_len; + __entry->tail_base = xdr->tail[0].iov_base; + __entry->tail_len = xdr->tail[0].iov_len; + __entry->page_len = xdr->page_len; + __entry->msg_len = xdr->len; + ), + + TP_printk("xid=0x%08x head=[%p,%zu] page=%u tail=[%p,%zu] len=%u", + __entry->xid, + __entry->head_base, __entry->head_len, __entry->page_len, + __entry->tail_base, __entry->tail_len, __entry->msg_len + ) +); + +#define DEFINE_SVCXDRMSG_EVENT(name) \ + DEFINE_EVENT(svc_xdr_msg_class, \ + svc_xdr_##name, \ + TP_PROTO( \ + const struct xdr_buf *xdr \ + ), \ + TP_ARGS(xdr)) + +DEFINE_SVCXDRMSG_EVENT(recvfrom); + +/* Record an xdr_buf containing arbitrary data, tagged with an XID */ +DECLARE_EVENT_CLASS(svc_xdr_buf_class, + TP_PROTO( + __be32 xid, + const struct xdr_buf *xdr + ), + + TP_ARGS(xid, xdr), + + TP_STRUCT__entry( + __field(u32, xid) + __field(const void *, head_base) + __field(size_t, head_len) + __field(const void *, tail_base) + __field(size_t, tail_len) + __field(unsigned int, page_len) + __field(unsigned int, msg_len) + ), + + TP_fast_assign( + __entry->xid = be32_to_cpu(xid); __entry->head_base = xdr->head[0].iov_base; __entry->head_len = xdr->head[0].iov_len; __entry->tail_base = xdr->tail[0].iov_base; @@ -1463,12 +1511,11 @@ DECLARE_EVENT_CLASS(svc_xdr_buf_class, DEFINE_EVENT(svc_xdr_buf_class, \ svc_xdr_##name, \ TP_PROTO( \ - const struct svc_rqst *rqst, \ + __be32 xid, \ const struct xdr_buf *xdr \ ), \ - TP_ARGS(rqst, xdr)) + TP_ARGS(xid, xdr)) -DEFINE_SVCXDRBUF_EVENT(recvfrom); DEFINE_SVCXDRBUF_EVENT(sendto); /* diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 5fb9164aa690..dcc50ae54550 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -857,6 +857,7 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) err = -EAGAIN; if (len <= 0) goto out_release; + trace_svc_xdr_recvfrom(&rqstp->rq_arg); clear_bit(XPT_OLD, &xprt->xpt_flags); @@ -866,7 +867,6 @@ int svc_recv(struct svc_rqst *rqstp, long timeout) if (serv->sv_stats) serv->sv_stats->netcnt++; - trace_svc_xdr_recvfrom(rqstp, &rqstp->rq_arg); return len; out_release: rqstp->rq_res.len = 0; @@ -904,7 +904,7 @@ int svc_send(struct svc_rqst *rqstp) xb->len = xb->head[0].iov_len + xb->page_len + xb->tail[0].iov_len; - trace_svc_xdr_sendto(rqstp, xb); + trace_svc_xdr_sendto(rqstp->rq_xid, xb); trace_svc_stats_latency(rqstp); len = xprt->xpt_ops->xpo_sendto(rqstp); From b90d72a6bfdb5e5c62cd223a8cdf4045bfbcb94d Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 12 Jan 2021 22:18:55 +0000 Subject: [PATCH 091/241] Revert "arm64: Enable perf events based hard lockup detector" This reverts commit 367c820ef08082e68df8a3bc12e62393af21e4b5. lockup_detector_init() makes heavy use of per-cpu variables and must be called with preemption disabled. Usually, it's handled early during boot in kernel_init_freeable(), before SMP has been initialised. Since we do not know whether or not our PMU interrupt can be signalled as an NMI until considerably later in the boot process, the Arm PMU driver attempts to re-initialise the lockup detector off the back of a device_initcall(). Unfortunately, this is called from preemptible context and results in the following splat: | BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1 | caller is debug_smp_processor_id+0x20/0x2c | CPU: 2 PID: 1 Comm: swapper/0 Not tainted 5.10.0+ #276 | Hardware name: linux,dummy-virt (DT) | Call trace: | dump_backtrace+0x0/0x3c0 | show_stack+0x20/0x6c | dump_stack+0x2f0/0x42c | check_preemption_disabled+0x1cc/0x1dc | debug_smp_processor_id+0x20/0x2c | hardlockup_detector_event_create+0x34/0x18c | hardlockup_detector_perf_init+0x2c/0x134 | watchdog_nmi_probe+0x18/0x24 | lockup_detector_init+0x44/0xa8 | armv8_pmu_driver_init+0x54/0x78 | do_one_initcall+0x184/0x43c | kernel_init_freeable+0x368/0x380 | kernel_init+0x1c/0x1cc | ret_from_fork+0x10/0x30 Rather than bodge this with raw_smp_processor_id() or randomly disabling preemption, simply revert the culprit for now until we figure out how to do this properly. Reported-by: Lecopzer Chen Signed-off-by: Will Deacon Acked-by: Mark Rutland Cc: Sumit Garg Cc: Alexandru Elisei Link: https://lore.kernel.org/r/20201221162249.3119-1-lecopzer.chen@mediatek.com Link: https://lore.kernel.org/r/20210112221855.10666-1-will@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/Kconfig | 2 -- arch/arm64/kernel/perf_event.c | 41 ++-------------------------------- drivers/perf/arm_pmu.c | 5 ----- include/linux/perf/arm_pmu.h | 2 -- 4 files changed, 2 insertions(+), 48 deletions(-) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 05e17351e4f3..f39568b28ec1 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -174,8 +174,6 @@ config ARM64 select HAVE_NMI select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS - select HAVE_PERF_EVENTS_NMI if ARM64_PSEUDO_NMI && HW_PERF_EVENTS - select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 38bb07eff872..3605f77ad4df 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -23,8 +23,6 @@ #include #include #include -#include -#include /* ARMv8 Cortex-A53 specific event types. */ #define ARMV8_A53_PERFCTR_PREF_LINEFILL 0xC2 @@ -1250,21 +1248,10 @@ static struct platform_driver armv8_pmu_driver = { static int __init armv8_pmu_driver_init(void) { - int ret; - if (acpi_disabled) - ret = platform_driver_register(&armv8_pmu_driver); + return platform_driver_register(&armv8_pmu_driver); else - ret = arm_pmu_acpi_probe(armv8_pmuv3_init); - - /* - * Try to re-initialize lockup detector after PMU init in - * case PMU events are triggered via NMIs. - */ - if (ret == 0 && arm_pmu_irq_is_nmi()) - lockup_detector_init(); - - return ret; + return arm_pmu_acpi_probe(armv8_pmuv3_init); } device_initcall(armv8_pmu_driver_init) @@ -1322,27 +1309,3 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->cap_user_time_zero = 1; userpg->cap_user_time_short = 1; } - -#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF -/* - * Safe maximum CPU frequency in case a particular platform doesn't implement - * cpufreq driver. Although, architecture doesn't put any restrictions on - * maximum frequency but 5 GHz seems to be safe maximum given the available - * Arm CPUs in the market which are clocked much less than 5 GHz. On the other - * hand, we can't make it much higher as it would lead to a large hard-lockup - * detection timeout on parts which are running slower (eg. 1GHz on - * Developerbox) and doesn't possess a cpufreq driver. - */ -#define SAFE_MAX_CPU_FREQ 5000000000UL // 5 GHz -u64 hw_nmi_get_sample_period(int watchdog_thresh) -{ - unsigned int cpu = smp_processor_id(); - unsigned long max_cpu_freq; - - max_cpu_freq = cpufreq_get_hw_max_freq(cpu) * 1000UL; - if (!max_cpu_freq) - max_cpu_freq = SAFE_MAX_CPU_FREQ; - - return (u64)max_cpu_freq * watchdog_thresh; -} -#endif diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index 794a37d50853..cb2f55f450e4 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -726,11 +726,6 @@ static int armpmu_get_cpu_irq(struct arm_pmu *pmu, int cpu) return per_cpu(hw_events->irq, cpu); } -bool arm_pmu_irq_is_nmi(void) -{ - return has_nmi; -} - /* * PMU hardware loses all context when a CPU goes offline. * When a CPU is hotplugged back in, since some hardware registers are diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index bf7966776c55..505480217cf1 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -163,8 +163,6 @@ int arm_pmu_acpi_probe(armpmu_init_fn init_fn); static inline int arm_pmu_acpi_probe(armpmu_init_fn init_fn) { return 0; } #endif -bool arm_pmu_irq_is_nmi(void); - /* Internal functions only for core arm_pmu code */ struct arm_pmu *armpmu_alloc(void); struct arm_pmu *armpmu_alloc_atomic(void); From 71e70184f1d1314ad56e834d1befc07daa2af8e6 Mon Sep 17 00:00:00 2001 From: Jianlin Lv Date: Tue, 12 Jan 2021 09:58:13 +0800 Subject: [PATCH 092/241] arm64: rename S_FRAME_SIZE to PT_REGS_SIZE S_FRAME_SIZE is the size of the pt_regs structure, no longer the size of the kernel stack frame, the name is misleading. In keeping with arm32, rename S_FRAME_SIZE to PT_REGS_SIZE. Signed-off-by: Jianlin Lv Acked-by: Mark Rutland Link: https://lore.kernel.org/r/20210112015813.2340969-1-Jianlin.Lv@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/asm-offsets.c | 2 +- arch/arm64/kernel/entry-ftrace.S | 12 ++++++------ arch/arm64/kernel/entry.S | 14 +++++++------- arch/arm64/kernel/probes/kprobes_trampoline.S | 6 +++--- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index f42fd9e33981..301784463587 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -75,7 +75,7 @@ int main(void) DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1)); DEFINE(S_PMR_SAVE, offsetof(struct pt_regs, pmr_save)); DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); - DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); + DEFINE(PT_REGS_SIZE, sizeof(struct pt_regs)); BLANK(); #ifdef CONFIG_COMPAT DEFINE(COMPAT_SIGFRAME_REGS_OFFSET, offsetof(struct compat_sigframe, uc.uc_mcontext.arm_r0)); diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index a338f40e64d3..b3e4f9a088b1 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -35,7 +35,7 @@ */ .macro ftrace_regs_entry, allregs=0 /* Make room for pt_regs, plus a callee frame */ - sub sp, sp, #(S_FRAME_SIZE + 16) + sub sp, sp, #(PT_REGS_SIZE + 16) /* Save function arguments (and x9 for simplicity) */ stp x0, x1, [sp, #S_X0] @@ -61,15 +61,15 @@ .endif /* Save the callsite's SP and LR */ - add x10, sp, #(S_FRAME_SIZE + 16) + add x10, sp, #(PT_REGS_SIZE + 16) stp x9, x10, [sp, #S_LR] /* Save the PC after the ftrace callsite */ str x30, [sp, #S_PC] /* Create a frame record for the callsite above pt_regs */ - stp x29, x9, [sp, #S_FRAME_SIZE] - add x29, sp, #S_FRAME_SIZE + stp x29, x9, [sp, #PT_REGS_SIZE] + add x29, sp, #PT_REGS_SIZE /* Create our frame record within pt_regs. */ stp x29, x30, [sp, #S_STACKFRAME] @@ -120,7 +120,7 @@ ftrace_common_return: ldr x9, [sp, #S_PC] /* Restore the callsite's SP */ - add sp, sp, #S_FRAME_SIZE + 16 + add sp, sp, #PT_REGS_SIZE + 16 ret x9 SYM_CODE_END(ftrace_common) @@ -130,7 +130,7 @@ SYM_CODE_START(ftrace_graph_caller) ldr x0, [sp, #S_PC] sub x0, x0, #AARCH64_INSN_SIZE // ip (callsite's BL insn) add x1, sp, #S_LR // parent_ip (callsite's LR) - ldr x2, [sp, #S_FRAME_SIZE] // parent fp (callsite's FP) + ldr x2, [sp, #PT_REGS_SIZE] // parent fp (callsite's FP) bl prepare_ftrace_return b ftrace_common_return SYM_CODE_END(ftrace_graph_caller) diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index a8c3e7aaca74..c9bae73f2621 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -75,7 +75,7 @@ alternative_else_nop_endif .endif #endif - sub sp, sp, #S_FRAME_SIZE + sub sp, sp, #PT_REGS_SIZE #ifdef CONFIG_VMAP_STACK /* * Test whether the SP has overflowed, without corrupting a GPR. @@ -96,7 +96,7 @@ alternative_else_nop_endif * userspace, and can clobber EL0 registers to free up GPRs. */ - /* Stash the original SP (minus S_FRAME_SIZE) in tpidr_el0. */ + /* Stash the original SP (minus PT_REGS_SIZE) in tpidr_el0. */ msr tpidr_el0, x0 /* Recover the original x0 value and stash it in tpidrro_el0 */ @@ -253,7 +253,7 @@ alternative_else_nop_endif scs_load tsk, x20 .else - add x21, sp, #S_FRAME_SIZE + add x21, sp, #PT_REGS_SIZE get_current_task tsk .endif /* \el == 0 */ mrs x22, elr_el1 @@ -377,7 +377,7 @@ alternative_else_nop_endif ldp x26, x27, [sp, #16 * 13] ldp x28, x29, [sp, #16 * 14] ldr lr, [sp, #S_LR] - add sp, sp, #S_FRAME_SIZE // restore sp + add sp, sp, #PT_REGS_SIZE // restore sp .if \el == 0 alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 @@ -580,12 +580,12 @@ __bad_stack: /* * Store the original GPRs to the new stack. The orginal SP (minus - * S_FRAME_SIZE) was stashed in tpidr_el0 by kernel_ventry. + * PT_REGS_SIZE) was stashed in tpidr_el0 by kernel_ventry. */ - sub sp, sp, #S_FRAME_SIZE + sub sp, sp, #PT_REGS_SIZE kernel_entry 1 mrs x0, tpidr_el0 - add x0, x0, #S_FRAME_SIZE + add x0, x0, #PT_REGS_SIZE str x0, [sp, #S_SP] /* Stash the regs for handle_bad_stack */ diff --git a/arch/arm64/kernel/probes/kprobes_trampoline.S b/arch/arm64/kernel/probes/kprobes_trampoline.S index 890ca72c5a51..288a84e253cc 100644 --- a/arch/arm64/kernel/probes/kprobes_trampoline.S +++ b/arch/arm64/kernel/probes/kprobes_trampoline.S @@ -25,7 +25,7 @@ stp x24, x25, [sp, #S_X24] stp x26, x27, [sp, #S_X26] stp x28, x29, [sp, #S_X28] - add x0, sp, #S_FRAME_SIZE + add x0, sp, #PT_REGS_SIZE stp lr, x0, [sp, #S_LR] /* * Construct a useful saved PSTATE @@ -62,7 +62,7 @@ .endm SYM_CODE_START(kretprobe_trampoline) - sub sp, sp, #S_FRAME_SIZE + sub sp, sp, #PT_REGS_SIZE save_all_base_regs @@ -76,7 +76,7 @@ SYM_CODE_START(kretprobe_trampoline) restore_all_base_regs - add sp, sp, #S_FRAME_SIZE + add sp, sp, #PT_REGS_SIZE ret SYM_CODE_END(kretprobe_trampoline) From c35a824c31834d947fb99b0c608c1b9f922b4ba0 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 8 Jan 2021 10:19:56 +0100 Subject: [PATCH 093/241] arm64: make atomic helpers __always_inline With UBSAN enabled and building with clang, there are occasionally warnings like WARNING: modpost: vmlinux.o(.text+0xc533ec): Section mismatch in reference from the function arch_atomic64_or() to the variable .init.data:numa_nodes_parsed The function arch_atomic64_or() references the variable __initdata numa_nodes_parsed. This is often because arch_atomic64_or lacks a __initdata annotation or the annotation of numa_nodes_parsed is wrong. for functions that end up not being inlined as intended but operating on __initdata variables. Mark these as __always_inline, along with the corresponding asm-generic wrappers. Signed-off-by: Arnd Bergmann Acked-by: Will Deacon Link: https://lore.kernel.org/r/20210108092024.4034860-1-arnd@kernel.org Signed-off-by: Catalin Marinas --- arch/arm64/include/asm/atomic.h | 10 +++++----- include/asm-generic/bitops/atomic.h | 6 +++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arch/arm64/include/asm/atomic.h b/arch/arm64/include/asm/atomic.h index 015ddffaf6ca..b56a4b2bc248 100644 --- a/arch/arm64/include/asm/atomic.h +++ b/arch/arm64/include/asm/atomic.h @@ -17,7 +17,7 @@ #include #define ATOMIC_OP(op) \ -static inline void arch_##op(int i, atomic_t *v) \ +static __always_inline void arch_##op(int i, atomic_t *v) \ { \ __lse_ll_sc_body(op, i, v); \ } @@ -32,7 +32,7 @@ ATOMIC_OP(atomic_sub) #undef ATOMIC_OP #define ATOMIC_FETCH_OP(name, op) \ -static inline int arch_##op##name(int i, atomic_t *v) \ +static __always_inline int arch_##op##name(int i, atomic_t *v) \ { \ return __lse_ll_sc_body(op##name, i, v); \ } @@ -56,7 +56,7 @@ ATOMIC_FETCH_OPS(atomic_sub_return) #undef ATOMIC_FETCH_OPS #define ATOMIC64_OP(op) \ -static inline void arch_##op(long i, atomic64_t *v) \ +static __always_inline void arch_##op(long i, atomic64_t *v) \ { \ __lse_ll_sc_body(op, i, v); \ } @@ -71,7 +71,7 @@ ATOMIC64_OP(atomic64_sub) #undef ATOMIC64_OP #define ATOMIC64_FETCH_OP(name, op) \ -static inline long arch_##op##name(long i, atomic64_t *v) \ +static __always_inline long arch_##op##name(long i, atomic64_t *v) \ { \ return __lse_ll_sc_body(op##name, i, v); \ } @@ -94,7 +94,7 @@ ATOMIC64_FETCH_OPS(atomic64_sub_return) #undef ATOMIC64_FETCH_OP #undef ATOMIC64_FETCH_OPS -static inline long arch_atomic64_dec_if_positive(atomic64_t *v) +static __always_inline long arch_atomic64_dec_if_positive(atomic64_t *v) { return __lse_ll_sc_body(atomic64_dec_if_positive, v); } diff --git a/include/asm-generic/bitops/atomic.h b/include/asm-generic/bitops/atomic.h index dd90c9792909..0e7316a86240 100644 --- a/include/asm-generic/bitops/atomic.h +++ b/include/asm-generic/bitops/atomic.h @@ -11,19 +11,19 @@ * See Documentation/atomic_bitops.txt for details. */ -static inline void set_bit(unsigned int nr, volatile unsigned long *p) +static __always_inline void set_bit(unsigned int nr, volatile unsigned long *p) { p += BIT_WORD(nr); atomic_long_or(BIT_MASK(nr), (atomic_long_t *)p); } -static inline void clear_bit(unsigned int nr, volatile unsigned long *p) +static __always_inline void clear_bit(unsigned int nr, volatile unsigned long *p) { p += BIT_WORD(nr); atomic_long_andnot(BIT_MASK(nr), (atomic_long_t *)p); } -static inline void change_bit(unsigned int nr, volatile unsigned long *p) +static __always_inline void change_bit(unsigned int nr, volatile unsigned long *p) { p += BIT_WORD(nr); atomic_long_xor(BIT_MASK(nr), (atomic_long_t *)p); From 3499ba8198cad47b731792e5e56b9ec2a78a83a2 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 13 Jan 2021 13:26:02 +0000 Subject: [PATCH 094/241] xen: Fix event channel callback via INTX/GSI For a while, event channel notification via the PCI platform device has been broken, because we attempt to communicate with xenstore before we even have notifications working, with the xs_reset_watches() call in xs_init(). We tend to get away with this on Xen versions below 4.0 because we avoid calling xs_reset_watches() anyway, because xenstore might not cope with reading a non-existent key. And newer Xen *does* have the vector callback support, so we rarely fall back to INTX/GSI delivery. To fix it, clean up a bit of the mess of xs_init() and xenbus_probe() startup. Call xs_init() directly from xenbus_init() only in the !XS_HVM case, deferring it to be called from xenbus_probe() in the XS_HVM case instead. Then fix up the invocation of xenbus_probe() to happen either from its device_initcall if the callback is available early enough, or when the callback is finally set up. This means that the hack of calling xenbus_probe() from a workqueue after the first interrupt, or directly from the PCI platform device setup, is no longer needed. Signed-off-by: David Woodhouse Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210113132606.422794-2-dwmw2@infradead.org Signed-off-by: Juergen Gross --- arch/arm/xen/enlighten.c | 2 +- drivers/xen/events/events_base.c | 10 ---- drivers/xen/platform-pci.c | 1 - drivers/xen/xenbus/xenbus.h | 1 + drivers/xen/xenbus/xenbus_comms.c | 8 --- drivers/xen/xenbus/xenbus_probe.c | 81 +++++++++++++++++++++++++------ include/xen/xenbus.h | 2 +- 7 files changed, 70 insertions(+), 35 deletions(-) diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index 60e901cd0de6..5a957a9a0984 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -371,7 +371,7 @@ static int __init xen_guest_init(void) } gnttab_init(); if (!xen_initial_domain()) - xenbus_probe(NULL); + xenbus_probe(); /* * Making sure board specific code will not set up ops for diff --git a/drivers/xen/events/events_base.c b/drivers/xen/events/events_base.c index 6038c4c35db5..bbebe248b726 100644 --- a/drivers/xen/events/events_base.c +++ b/drivers/xen/events/events_base.c @@ -2010,16 +2010,6 @@ static struct irq_chip xen_percpu_chip __read_mostly = { .irq_ack = ack_dynirq, }; -int xen_set_callback_via(uint64_t via) -{ - struct xen_hvm_param a; - a.domid = DOMID_SELF; - a.index = HVM_PARAM_CALLBACK_IRQ; - a.value = via; - return HYPERVISOR_hvm_op(HVMOP_set_param, &a); -} -EXPORT_SYMBOL_GPL(xen_set_callback_via); - #ifdef CONFIG_XEN_PVHVM /* Vector callbacks are better than PCI interrupts to receive event * channel notifications because we can receive vector callbacks on any diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index dd911e1ff782..9db557b76511 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -149,7 +149,6 @@ static int platform_pci_probe(struct pci_dev *pdev, ret = gnttab_init(); if (ret) goto grant_out; - xenbus_probe(NULL); return 0; grant_out: gnttab_free_auto_xlat_frames(); diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h index 2a93b7c9c159..dc1537335414 100644 --- a/drivers/xen/xenbus/xenbus.h +++ b/drivers/xen/xenbus/xenbus.h @@ -115,6 +115,7 @@ int xenbus_probe_node(struct xen_bus_type *bus, const char *type, const char *nodename); int xenbus_probe_devices(struct xen_bus_type *bus); +void xenbus_probe(void); void xenbus_dev_changed(const char *node, struct xen_bus_type *bus); diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c index eb5151fc8efa..e5fda0256feb 100644 --- a/drivers/xen/xenbus/xenbus_comms.c +++ b/drivers/xen/xenbus/xenbus_comms.c @@ -57,16 +57,8 @@ DEFINE_MUTEX(xs_response_mutex); static int xenbus_irq; static struct task_struct *xenbus_task; -static DECLARE_WORK(probe_work, xenbus_probe); - - static irqreturn_t wake_waiting(int irq, void *unused) { - if (unlikely(xenstored_ready == 0)) { - xenstored_ready = 1; - schedule_work(&probe_work); - } - wake_up(&xb_waitq); return IRQ_HANDLED; } diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 44634d970a5c..c8f0282bb649 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -683,29 +683,76 @@ void unregister_xenstore_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_xenstore_notifier); -void xenbus_probe(struct work_struct *unused) +void xenbus_probe(void) { xenstored_ready = 1; + /* + * In the HVM case, xenbus_init() deferred its call to + * xs_init() in case callbacks were not operational yet. + * So do it now. + */ + if (xen_store_domain_type == XS_HVM) + xs_init(); + /* Notify others that xenstore is up */ blocking_notifier_call_chain(&xenstore_chain, 0, NULL); } -EXPORT_SYMBOL_GPL(xenbus_probe); + +/* + * Returns true when XenStore init must be deferred in order to + * allow the PCI platform device to be initialised, before we + * can actually have event channel interrupts working. + */ +static bool xs_hvm_defer_init_for_callback(void) +{ +#ifdef CONFIG_XEN_PVHVM + return xen_store_domain_type == XS_HVM && + !xen_have_vector_callback; +#else + return false; +#endif +} static int __init xenbus_probe_initcall(void) { - if (!xen_domain()) - return -ENODEV; + /* + * Probe XenBus here in the XS_PV case, and also XS_HVM unless we + * need to wait for the platform PCI device to come up. + */ + if (xen_store_domain_type == XS_PV || + (xen_store_domain_type == XS_HVM && + !xs_hvm_defer_init_for_callback())) + xenbus_probe(); - if (xen_initial_domain() || xen_hvm_domain()) - return 0; - - xenbus_probe(NULL); return 0; } - device_initcall(xenbus_probe_initcall); +int xen_set_callback_via(uint64_t via) +{ + struct xen_hvm_param a; + int ret; + + a.domid = DOMID_SELF; + a.index = HVM_PARAM_CALLBACK_IRQ; + a.value = via; + + ret = HYPERVISOR_hvm_op(HVMOP_set_param, &a); + if (ret) + return ret; + + /* + * If xenbus_probe_initcall() deferred the xenbus_probe() + * due to the callback not functioning yet, we can do it now. + */ + if (!xenstored_ready && xs_hvm_defer_init_for_callback()) + xenbus_probe(); + + return ret; +} +EXPORT_SYMBOL_GPL(xen_set_callback_via); + /* Set up event channel for xenstored which is run as a local process * (this is normally used only in dom0) */ @@ -818,11 +865,17 @@ static int __init xenbus_init(void) break; } - /* Initialize the interface to xenstore. */ - err = xs_init(); - if (err) { - pr_warn("Error initializing xenstore comms: %i\n", err); - goto out_error; + /* + * HVM domains may not have a functional callback yet. In that + * case let xs_init() be called from xenbus_probe(), which will + * get invoked at an appropriate time. + */ + if (xen_store_domain_type != XS_HVM) { + err = xs_init(); + if (err) { + pr_warn("Error initializing xenstore comms: %i\n", err); + goto out_error; + } } if ((xen_store_domain_type != XS_LOCAL) && diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 00c7235ae93e..2c43b0ef1e4d 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -192,7 +192,7 @@ void xs_suspend_cancel(void); struct work_struct; -void xenbus_probe(struct work_struct *); +void xenbus_probe(void); #define XENBUS_IS_ERR_READ(str) ({ \ if (!IS_ERR(str) && strlen(str) == 0) { \ From 8f4fd86aa5d6aa122619623910065d236592e37c Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 6 Jan 2021 15:39:55 +0000 Subject: [PATCH 095/241] xen: Set platform PCI device INTX affinity to CPU0 With INTX or GSI delivery, Xen uses the event channel structures of CPU0. If the interrupt gets handled by Linux on a different CPU, then no events are seen as pending. Rather than introducing locking to allow other CPUs to process CPU0's events, just ensure that the PCI interrupts happens only on CPU0. Signed-off-by: David Woodhouse Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210106153958.584169-3-dwmw2@infradead.org Signed-off-by: Juergen Gross --- drivers/xen/platform-pci.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 9db557b76511..18f0ed8b1f93 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -132,6 +132,13 @@ static int platform_pci_probe(struct pci_dev *pdev, dev_warn(&pdev->dev, "request_irq failed err=%d\n", ret); goto out; } + /* + * It doesn't strictly *have* to run on CPU0 but it sure + * as hell better process the event channel ports delivered + * to CPU0. + */ + irq_set_affinity(pdev->irq, cpumask_of(0)); + callback_via = get_callback_via(pdev); ret = xen_set_callback_via(callback_via); if (ret) { From b36b0fe96af13460278bf9b173beced1bd15f85d Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 6 Jan 2021 15:39:56 +0000 Subject: [PATCH 096/241] x86/xen: Add xen_no_vector_callback option to test PCI INTX delivery It's useful to be able to test non-vector event channel delivery, to make sure Linux will work properly on older Xen which doesn't have it. It's also useful for those working on Xen and Xen-compatible hypervisors, because there are guest kernels still in active use which use PCI INTX even when vector delivery is available. Signed-off-by: David Woodhouse Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210106153958.584169-4-dwmw2@infradead.org Signed-off-by: Juergen Gross --- Documentation/admin-guide/kernel-parameters.txt | 4 ++++ arch/x86/xen/enlighten_hvm.c | 11 ++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 44fde25bb221..035d27b6272c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5964,6 +5964,10 @@ This option is obsoleted by the "nopv" option, which has equivalent effect for XEN platform. + xen_no_vector_callback + [KNL,X86,XEN] Disable the vector callback for Xen + event channel interrupts. + xen_scrub_pages= [XEN] Boolean option to control scrubbing pages before giving them back to Xen, for use by other domains. Can be also changed at runtime diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index 9e87ab010c82..ec50b7423a4c 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -188,6 +188,8 @@ static int xen_cpu_dead_hvm(unsigned int cpu) return 0; } +static bool no_vector_callback __initdata; + static void __init xen_hvm_guest_init(void) { if (xen_pv_domain()) @@ -207,7 +209,7 @@ static void __init xen_hvm_guest_init(void) xen_panic_handler_init(); - if (xen_feature(XENFEAT_hvm_callback_vector)) + if (!no_vector_callback && xen_feature(XENFEAT_hvm_callback_vector)) xen_have_vector_callback = 1; xen_hvm_smp_init(); @@ -233,6 +235,13 @@ static __init int xen_parse_nopv(char *arg) } early_param("xen_nopv", xen_parse_nopv); +static __init int xen_parse_no_vector_callback(char *arg) +{ + no_vector_callback = true; + return 0; +} +early_param("xen_no_vector_callback", xen_parse_no_vector_callback); + bool __init xen_hvm_need_lapic(void) { if (xen_pv_domain()) From 4621dc6a5bf1235249e92231db30c96dfd1a18b9 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 6 Jan 2021 15:39:57 +0000 Subject: [PATCH 097/241] x86/xen: Don't register Xen IPIs when they aren't going to be used In the case where xen_have_vector_callback is false, we still register the IPI vectors in xen_smp_intr_init() for the secondary CPUs even though they aren't going to be used. Stop doing that. Signed-off-by: David Woodhouse Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210106153958.584169-5-dwmw2@infradead.org Signed-off-by: Juergen Gross --- arch/x86/xen/enlighten_hvm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/enlighten_hvm.c b/arch/x86/xen/enlighten_hvm.c index ec50b7423a4c..e68ea5f4ad1c 100644 --- a/arch/x86/xen/enlighten_hvm.c +++ b/arch/x86/xen/enlighten_hvm.c @@ -164,10 +164,10 @@ static int xen_cpu_up_prepare_hvm(unsigned int cpu) else per_cpu(xen_vcpu_id, cpu) = cpu; rc = xen_vcpu_setup(cpu); - if (rc) + if (rc || !xen_have_vector_callback) return rc; - if (xen_have_vector_callback && xen_feature(XENFEAT_hvm_safe_pvclock)) + if (xen_feature(XENFEAT_hvm_safe_pvclock)) xen_setup_timer(cpu); rc = xen_smp_intr_init(cpu); From 3d7746bea92530e8695258a3cf3ddec7a135edd6 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Wed, 6 Jan 2021 15:39:58 +0000 Subject: [PATCH 098/241] x86/xen: Fix xen_hvm_smp_init() when vector callback not available MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Only the IPI-related functions in the smp_ops should be conditional on the vector callback being available. The rest should still happen: • xen_hvm_smp_prepare_boot_cpu() This function does two things, both of which should still happen if there is no vector callback support. The call to xen_vcpu_setup() for vCPU0 should still happen as it just sets up the vcpu_info for CPU0. That does happen for the secondary vCPUs too, from xen_cpu_up_prepare_hvm(). The second thing it does is call xen_init_spinlocks(), which perhaps counter-intuitively should *also* still be happening in the case without vector callbacks, so that it can clear its local xen_pvspin flag and disable the virt_spin_lock_key accordingly. Checking xen_have_vector_callback in xen_init_spinlocks() itself would affect PV guests, so set the global nopvspin flag in xen_hvm_smp_init() instead, when vector callbacks aren't available. • xen_hvm_smp_prepare_cpus() This does some IPI-related setup by calling xen_smp_intr_init() and xen_init_lock_cpu(), which can be made conditional. And it sets the xen_vcpu_id to XEN_VCPU_ID_INVALID for all possible CPUS, which does need to happen. • xen_smp_cpus_done() This offlines any vCPUs which doesn't fit in the global shared_info page, if separate vcpu_info placement isn't available. That part also needs to happen regardless of vector callback support. • xen_hvm_cpu_die() This doesn't actually do anything other than commin_cpu_die() right right now in the !vector_callback case; all three teardown functions it calls should be no-ops. But to guard against future regressions it's useful to call it anyway, and for it to explicitly check for xen_have_vector_callback before calling those additional functions. Signed-off-by: David Woodhouse Reviewed-by: Boris Ostrovsky Link: https://lore.kernel.org/r/20210106153958.584169-6-dwmw2@infradead.org Signed-off-by: Juergen Gross --- arch/x86/xen/smp_hvm.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c index f5e7db4f82ab..056430a1080b 100644 --- a/arch/x86/xen/smp_hvm.c +++ b/arch/x86/xen/smp_hvm.c @@ -33,9 +33,11 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) int cpu; native_smp_prepare_cpus(max_cpus); - WARN_ON(xen_smp_intr_init(0)); - xen_init_lock_cpu(0); + if (xen_have_vector_callback) { + WARN_ON(xen_smp_intr_init(0)); + xen_init_lock_cpu(0); + } for_each_possible_cpu(cpu) { if (cpu == 0) @@ -50,9 +52,11 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus) static void xen_hvm_cpu_die(unsigned int cpu) { if (common_cpu_die(cpu) == 0) { - xen_smp_intr_free(cpu); - xen_uninit_lock_cpu(cpu); - xen_teardown_timer(cpu); + if (xen_have_vector_callback) { + xen_smp_intr_free(cpu); + xen_uninit_lock_cpu(cpu); + xen_teardown_timer(cpu); + } } } #else @@ -64,14 +68,17 @@ static void xen_hvm_cpu_die(unsigned int cpu) void __init xen_hvm_smp_init(void) { - if (!xen_have_vector_callback) - return; - + smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu; smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus; - smp_ops.smp_send_reschedule = xen_smp_send_reschedule; + smp_ops.smp_cpus_done = xen_smp_cpus_done; smp_ops.cpu_die = xen_hvm_cpu_die; + + if (!xen_have_vector_callback) { + nopvspin = true; + return; + } + + smp_ops.smp_send_reschedule = xen_smp_send_reschedule; smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi; smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi; - smp_ops.smp_prepare_boot_cpu = xen_hvm_smp_prepare_boot_cpu; - smp_ops.smp_cpus_done = xen_smp_cpus_done; } From b4411616c26f26c4017b8fa4d3538b1a02028733 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 13 Jan 2021 12:42:24 +0000 Subject: [PATCH 099/241] io_uring: fix null-deref in io_disable_sqo_submit general protection fault, probably for non-canonical address 0xdffffc0000000022: 0000 [#1] KASAN: null-ptr-deref in range [0x0000000000000110-0x0000000000000117] RIP: 0010:io_ring_set_wakeup_flag fs/io_uring.c:6929 [inline] RIP: 0010:io_disable_sqo_submit+0xdb/0x130 fs/io_uring.c:8891 Call Trace: io_uring_create fs/io_uring.c:9711 [inline] io_uring_setup+0x12b1/0x38e0 fs/io_uring.c:9739 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 io_disable_sqo_submit() might be called before user rings were allocated, don't do io_ring_set_wakeup_flag() in those cases. Reported-by: syzbot+ab412638aeb652ded540@syzkaller.appspotmail.com Fixes: d9d05217cb69 ("io_uring: stop SQPOLL submit on creator's death") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index b0e6d8e607a3..66db2c46ab82 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -8895,7 +8895,8 @@ static void io_disable_sqo_submit(struct io_ring_ctx *ctx) mutex_unlock(&ctx->uring_lock); /* make sure callers enter the ring to get error */ - io_ring_set_wakeup_flag(ctx); + if (ctx->rings) + io_ring_set_wakeup_flag(ctx); } /* From 06585c497b55045ec21aa8128e340f6a6587351c Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Wed, 13 Jan 2021 12:42:25 +0000 Subject: [PATCH 100/241] io_uring: do sqo disable on install_fd error WARNING: CPU: 0 PID: 8494 at fs/io_uring.c:8717 io_ring_ctx_wait_and_kill+0x4f2/0x600 fs/io_uring.c:8717 Call Trace: io_uring_release+0x3e/0x50 fs/io_uring.c:8759 __fput+0x283/0x920 fs/file_table.c:280 task_work_run+0xdd/0x190 kernel/task_work.c:140 tracehook_notify_resume include/linux/tracehook.h:189 [inline] exit_to_user_mode_loop kernel/entry/common.c:174 [inline] exit_to_user_mode_prepare+0x249/0x250 kernel/entry/common.c:201 __syscall_exit_to_user_mode_work kernel/entry/common.c:291 [inline] syscall_exit_to_user_mode+0x19/0x50 kernel/entry/common.c:302 entry_SYSCALL_64_after_hwframe+0x44/0xa9 failed io_uring_install_fd() is a special case, we don't do io_ring_ctx_wait_and_kill() directly but defer it to fput, though still need to io_disable_sqo_submit() before. note: it doesn't fix any real problem, just a warning. That's because sqring won't be available to the userspace in this case and so SQPOLL won't submit anything. Reported-by: syzbot+9c9c35374c0ecac06516@syzkaller.appspotmail.com Fixes: d9d05217cb69 ("io_uring: stop SQPOLL submit on creator's death") Signed-off-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 66db2c46ab82..372be9caf340 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9708,6 +9708,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, */ ret = io_uring_install_fd(ctx, file); if (ret < 0) { + io_disable_sqo_submit(ctx); /* fput will clean it up */ fput(file); return ret; From 77b6ec01c29aade01701aa30bf1469acc7f2be76 Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Tue, 5 Jan 2021 12:21:26 -0800 Subject: [PATCH 101/241] cifs: check pointer before freeing clang static analysis reports this problem dfs_cache.c:591:2: warning: Argument to kfree() is a constant address (18446744073709551614), which is not memory allocated by malloc() kfree(vi); ^~~~~~~~~ In dfs_cache_del_vol() the volume info pointer 'vi' being freed is the return of a call to find_vol(). The large constant address is find_vol() returning an error. Add an error check to dfs_cache_del_vol() similar to the one done in dfs_cache_update_vol(). Fixes: 54be1f6c1c37 ("cifs: Add DFS cache routines") Signed-off-by: Tom Rix Reviewed-by: Nathan Chancellor CC: # v5.0+ Signed-off-by: Steve French --- fs/cifs/dfs_cache.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 6ad6ba5f6ebe..0fdb0de7ff86 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -1260,7 +1260,8 @@ void dfs_cache_del_vol(const char *fullpath) vi = find_vol(fullpath); spin_unlock(&vol_list_lock); - kref_put(&vi->refcnt, vol_release); + if (!IS_ERR(vi)) + kref_put(&vi->refcnt, vol_release); } /** From 2659d3bff3e1b000f49907d0839178b101a89887 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Wed, 13 Jan 2021 14:16:16 -0300 Subject: [PATCH 102/241] cifs: fix interrupted close commands Retry close command if it gets interrupted to not leak open handles on the server. Signed-off-by: Paulo Alcantara (SUSE) Reported-by: Duncan Findlay Suggested-by: Pavel Shilovsky Fixes: 6988a619f5b7 ("cifs: allow syscalls to be restarted in __smb_send_rqst()") Cc: stable@vger.kernel.org Reviewd-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 067eb44c7baa..794fc3b68b4f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -3248,7 +3248,7 @@ close_exit: free_rsp_buf(resp_buftype, rsp); /* retry close in a worker thread if this one is interrupted */ - if (rc == -EINTR) { + if (is_interrupt_error(rc)) { int tmp_rc; tmp_rc = smb2_handle_cancelled_close(tcon, persistent_fid, From c13e7af042270724b42a466edc48a70a43f571f2 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Tue, 12 Jan 2021 01:13:40 -0800 Subject: [PATCH 103/241] fs: cifs: remove unneeded variable in smb3_fs_context_dup 'rc' in smb3_fs_context_dup is not used and can be removed. Signed-off-by: Menglong Dong Reviewed-by: Aurelien Aptel Signed-off-by: Steve French --- fs/cifs/fs_context.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 0afccbbed2e6..076bcadc756a 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -303,8 +303,6 @@ do { \ int smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx) { - int rc = 0; - memcpy(new_ctx, ctx, sizeof(*ctx)); new_ctx->prepath = NULL; new_ctx->mount_options = NULL; @@ -327,7 +325,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx DUP_CTX_STR(nodename); DUP_CTX_STR(iocharset); - return rc; + return 0; } static int From ed6b1920f84bc5c3d666dc383ff3bbc60f0f62a5 Mon Sep 17 00:00:00 2001 From: YANG LI Date: Mon, 11 Jan 2021 17:15:28 +0800 Subject: [PATCH 104/241] cifs: connect: style: Simplify bool comparison Fix the following coccicheck warning: ./fs/cifs/connect.c:3740:6-21: WARNING: Comparison of 0/1 to bool variable Signed-off-by: YANG LI Reported-by: Abaci Robot Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index b9df85506938..5d39129406ea 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3740,7 +3740,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if (!ses->binding) { ses->capabilities = server->capabilities; - if (linuxExtEnabled == 0) + if (!linuxExtEnabled) ses->capabilities &= (~server->vals->cap_unix); if (ses->auth_key.response) { From e54fd0716c3db20c0cba73fee2c3a4274b08c24e Mon Sep 17 00:00:00 2001 From: YANG LI Date: Wed, 30 Dec 2020 14:35:45 +0800 Subject: [PATCH 105/241] cifs: style: replace one-element array with flexible-array There is a regular need in the kernel to provide a way to declare having a dynamically sized set of trailing elements in a structure. Kernel code should always use "flexible array members"[1] for these cases. The older style of one-element or zero-length arrays should no longer be used[2]. [1] https://en.wikipedia.org/wiki/Flexible_array_member [2] https://www.kernel.org/doc/html/v5.9/process/ deprecated.html#zero-length-and-one-element-arrays Signed-off-by: YANG LI Reported-by: Abaci Signed-off-by: Steve French --- fs/cifs/smb2pdu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 204a622b89ed..d85edf5d1429 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -424,7 +424,7 @@ struct smb2_rdma_transform_capabilities_context { __le16 TransformCount; __u16 Reserved1; __u32 Reserved2; - __le16 RDMATransformIds[1]; + __le16 RDMATransformIds[]; } __packed; /* Signing algorithms */ From 7ac6ad051150592557520b45773201b987ecfce3 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 12 Jan 2021 15:42:54 -0800 Subject: [PATCH 106/241] bpf: Reject too big ctx_size_in for raw_tp test run syzbot reported a WARNING for allocating too big memory: WARNING: CPU: 1 PID: 8484 at mm/page_alloc.c:4976 __alloc_pages_nodemask+0x5f8/0x730 mm/page_alloc.c:5011 Modules linked in: CPU: 1 PID: 8484 Comm: syz-executor862 Not tainted 5.11.0-rc2-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:__alloc_pages_nodemask+0x5f8/0x730 mm/page_alloc.c:4976 Code: 00 00 0c 00 0f 85 a7 00 00 00 8b 3c 24 4c 89 f2 44 89 e6 c6 44 24 70 00 48 89 6c 24 58 e8 d0 d7 ff ff 49 89 c5 e9 ea fc ff ff <0f> 0b e9 b5 fd ff ff 89 74 24 14 4c 89 4c 24 08 4c 89 74 24 18 e8 RSP: 0018:ffffc900012efb10 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 1ffff9200025df66 RCX: 0000000000000000 RDX: 0000000000000000 RSI: dffffc0000000000 RDI: 0000000000140dc0 RBP: 0000000000140dc0 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff81b1f7e1 R11: 0000000000000000 R12: 0000000000000014 R13: 0000000000000014 R14: 0000000000000000 R15: 0000000000000000 FS: 000000000190c880(0000) GS:ffff8880b9e00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f08b7f316c0 CR3: 0000000012073000 CR4: 00000000001506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: alloc_pages_current+0x18c/0x2a0 mm/mempolicy.c:2267 alloc_pages include/linux/gfp.h:547 [inline] kmalloc_order+0x2e/0xb0 mm/slab_common.c:837 kmalloc_order_trace+0x14/0x120 mm/slab_common.c:853 kmalloc include/linux/slab.h:557 [inline] kzalloc include/linux/slab.h:682 [inline] bpf_prog_test_run_raw_tp+0x4b5/0x670 net/bpf/test_run.c:282 bpf_prog_test_run kernel/bpf/syscall.c:3120 [inline] __do_sys_bpf+0x1ea9/0x4f10 kernel/bpf/syscall.c:4398 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x440499 Code: 18 89 d0 c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 7b 13 fc ff c3 66 2e 0f 1f 84 00 00 00 00 RSP: 002b:00007ffe1f3bfb18 EFLAGS: 00000246 ORIG_RAX: 0000000000000141 RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 0000000000440499 RDX: 0000000000000048 RSI: 0000000020000600 RDI: 000000000000000a RBP: 00000000006ca018 R08: 0000000000000000 R09: 00000000004002c8 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000401ca0 R13: 0000000000401d30 R14: 0000000000000000 R15: 0000000000000000 This is because we didn't filter out too big ctx_size_in. Fix it by rejecting ctx_size_in that are bigger than MAX_BPF_FUNC_ARGS (12) u64 numbers. Fixes: 1b4d60ec162f ("bpf: Enable BPF_PROG_TEST_RUN for raw_tracepoint") Reported-by: syzbot+4f98876664c7337a4ae6@syzkaller.appspotmail.com Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20210112234254.1906829-1-songliubraving@fb.com --- net/bpf/test_run.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index c1c30a9f76f3..8b796c499cbb 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -272,7 +272,8 @@ int bpf_prog_test_run_raw_tp(struct bpf_prog *prog, kattr->test.repeat) return -EINVAL; - if (ctx_size_in < prog->aux->max_ctx_offset) + if (ctx_size_in < prog->aux->max_ctx_offset || + ctx_size_in > MAX_BPF_FUNC_ARGS * sizeof(u64)) return -EINVAL; if ((kattr->test.flags & BPF_F_TEST_RUN_ON_CPU) == 0 && cpu != 0) From 744ea4e3885eccb6d332a06fae9eb7420a622c0f Mon Sep 17 00:00:00 2001 From: Gilad Reti Date: Wed, 13 Jan 2021 07:38:07 +0200 Subject: [PATCH 107/241] bpf: Support PTR_TO_MEM{,_OR_NULL} register spilling Add support for pointer to mem register spilling, to allow the verifier to track pointers to valid memory addresses. Such pointers are returned for example by a successful call of the bpf_ringbuf_reserve helper. The patch was partially contributed by CyberArk Software, Inc. Fixes: 457f44363a88 ("bpf: Implement BPF ring buffer and verifier support for it") Suggested-by: Yonghong Song Signed-off-by: Gilad Reti Signed-off-by: Alexei Starovoitov Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20210113053810.13518-1-gilad.reti@gmail.com --- kernel/bpf/verifier.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 17270b8404f1..36af69fac591 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2217,6 +2217,8 @@ static bool is_spillable_regtype(enum bpf_reg_type type) case PTR_TO_RDWR_BUF: case PTR_TO_RDWR_BUF_OR_NULL: case PTR_TO_PERCPU_BTF_ID: + case PTR_TO_MEM: + case PTR_TO_MEM_OR_NULL: return true; default: return false; From 4237e9f4a96228ccc8a7abe5e4b30834323cd353 Mon Sep 17 00:00:00 2001 From: Gilad Reti Date: Wed, 13 Jan 2021 07:38:08 +0200 Subject: [PATCH 108/241] selftests/bpf: Add verifier test for PTR_TO_MEM spill Add a test to check that the verifier is able to recognize spilling of PTR_TO_MEM registers, by reserving a ringbuf buffer, forcing the spill of a pointer holding the buffer address to the stack, filling it back in from the stack and writing to the memory area pointed by it. The patch was partially contributed by CyberArk Software, Inc. Signed-off-by: Gilad Reti Signed-off-by: Alexei Starovoitov Acked-by: Yonghong Song Acked-by: KP Singh Link: https://lore.kernel.org/bpf/20210113053810.13518-2-gilad.reti@gmail.com --- tools/testing/selftests/bpf/test_verifier.c | 12 +++++++- .../selftests/bpf/verifier/spill_fill.c | 30 +++++++++++++++++++ 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c index 777a81404fdb..f8569f04064b 100644 --- a/tools/testing/selftests/bpf/test_verifier.c +++ b/tools/testing/selftests/bpf/test_verifier.c @@ -50,7 +50,7 @@ #define MAX_INSNS BPF_MAXINSNS #define MAX_TEST_INSNS 1000000 #define MAX_FIXUPS 8 -#define MAX_NR_MAPS 20 +#define MAX_NR_MAPS 21 #define MAX_TEST_RUNS 8 #define POINTER_VALUE 0xcafe4all #define TEST_DATA_LEN 64 @@ -87,6 +87,7 @@ struct bpf_test { int fixup_sk_storage_map[MAX_FIXUPS]; int fixup_map_event_output[MAX_FIXUPS]; int fixup_map_reuseport_array[MAX_FIXUPS]; + int fixup_map_ringbuf[MAX_FIXUPS]; const char *errstr; const char *errstr_unpriv; uint32_t insn_processed; @@ -640,6 +641,7 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, int *fixup_sk_storage_map = test->fixup_sk_storage_map; int *fixup_map_event_output = test->fixup_map_event_output; int *fixup_map_reuseport_array = test->fixup_map_reuseport_array; + int *fixup_map_ringbuf = test->fixup_map_ringbuf; if (test->fill_helper) { test->fill_insns = calloc(MAX_TEST_INSNS, sizeof(struct bpf_insn)); @@ -817,6 +819,14 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, fixup_map_reuseport_array++; } while (*fixup_map_reuseport_array); } + if (*fixup_map_ringbuf) { + map_fds[20] = create_map(BPF_MAP_TYPE_RINGBUF, 0, + 0, 4096); + do { + prog[*fixup_map_ringbuf].imm = map_fds[20]; + fixup_map_ringbuf++; + } while (*fixup_map_ringbuf); + } } struct libcap { diff --git a/tools/testing/selftests/bpf/verifier/spill_fill.c b/tools/testing/selftests/bpf/verifier/spill_fill.c index 45d43bf82f26..0b943897aaf6 100644 --- a/tools/testing/selftests/bpf/verifier/spill_fill.c +++ b/tools/testing/selftests/bpf/verifier/spill_fill.c @@ -28,6 +28,36 @@ .result = ACCEPT, .result_unpriv = ACCEPT, }, +{ + "check valid spill/fill, ptr to mem", + .insns = { + /* reserve 8 byte ringbuf memory */ + BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), + BPF_LD_MAP_FD(BPF_REG_1, 0), + BPF_MOV64_IMM(BPF_REG_2, 8), + BPF_MOV64_IMM(BPF_REG_3, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_reserve), + /* store a pointer to the reserved memory in R6 */ + BPF_MOV64_REG(BPF_REG_6, BPF_REG_0), + /* check whether the reservation was successful */ + BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 6), + /* spill R6(mem) into the stack */ + BPF_STX_MEM(BPF_DW, BPF_REG_10, BPF_REG_6, -8), + /* fill it back in R7 */ + BPF_LDX_MEM(BPF_DW, BPF_REG_7, BPF_REG_10, -8), + /* should be able to access *(R7) = 0 */ + BPF_ST_MEM(BPF_DW, BPF_REG_7, 0, 0), + /* submit the reserved ringbuf memory */ + BPF_MOV64_REG(BPF_REG_1, BPF_REG_7), + BPF_MOV64_IMM(BPF_REG_2, 0), + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_ringbuf_submit), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .fixup_map_ringbuf = { 1 }, + .result = ACCEPT, + .result_unpriv = ACCEPT, +}, { "check corrupted spill/fill", .insns = { From c25a053e15778f6b4d6553708673736e27a6c2cf Mon Sep 17 00:00:00 2001 From: Nick Hu Date: Wed, 13 Jan 2021 10:24:10 +0800 Subject: [PATCH 109/241] riscv: Fix KASAN memory mapping. Use virtual address instead of physical address when translating the address to shadow memory by kasan_mem_to_shadow(). Signed-off-by: Nick Hu Signed-off-by: Nylon Chen Fixes: b10d6bca8720 ("arch, drivers: replace for_each_membock() with for_each_mem_range()") Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/kasan_init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/riscv/mm/kasan_init.c b/arch/riscv/mm/kasan_init.c index 12ddd1f6bf70..a8a2ffd9114a 100644 --- a/arch/riscv/mm/kasan_init.c +++ b/arch/riscv/mm/kasan_init.c @@ -93,8 +93,8 @@ void __init kasan_init(void) VMALLOC_END)); for_each_mem_range(i, &_start, &_end) { - void *start = (void *)_start; - void *end = (void *)_end; + void *start = (void *)__va(_start); + void *end = (void *)__va(_end); if (start >= end) break; From 41131a5e54ae7ba5a2bb8d7b30d1818b3f5b13d2 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Tue, 12 Jan 2021 11:55:15 +0000 Subject: [PATCH 110/241] powerpc/vdso: Fix clock_gettime_fallback for vdso32 The second argument of __kernel_clock_gettime64 points to a struct __kernel_timespec, with 64-bit time_t, so use the clock_gettime64 syscall in the fallback function for the 32-bit VDSO. Similarly, clock_getres_fallback should use the clock_getres_time64 syscall, though it isn't yet called from the 32-bit VDSO. Fixes: d0e3fc69d00d ("powerpc/vdso: Provide __kernel_clock_gettime64() on vdso32") Signed-off-by: Andreas Schwab [chleroy: Moved into a single #ifdef __powerpc64__ block] Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/0c0ab0eb3cc80687c326f76ff0dd5762b8812ecc.1610452505.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/vdso/gettimeofday.h | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/vdso/gettimeofday.h b/arch/powerpc/include/asm/vdso/gettimeofday.h index 81671aa365b3..77c635c2c90d 100644 --- a/arch/powerpc/include/asm/vdso/gettimeofday.h +++ b/arch/powerpc/include/asm/vdso/gettimeofday.h @@ -103,6 +103,8 @@ int gettimeofday_fallback(struct __kernel_old_timeval *_tv, struct timezone *_tz return do_syscall_2(__NR_gettimeofday, (unsigned long)_tv, (unsigned long)_tz); } +#ifdef __powerpc64__ + static __always_inline int clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) { @@ -115,10 +117,22 @@ int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) return do_syscall_2(__NR_clock_getres, _clkid, (unsigned long)_ts); } -#ifdef CONFIG_VDSO32 +#else #define BUILD_VDSO32 1 +static __always_inline +int clock_gettime_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) +{ + return do_syscall_2(__NR_clock_gettime64, _clkid, (unsigned long)_ts); +} + +static __always_inline +int clock_getres_fallback(clockid_t _clkid, struct __kernel_timespec *_ts) +{ + return do_syscall_2(__NR_clock_getres_time64, _clkid, (unsigned long)_ts); +} + static __always_inline int clock_gettime32_fallback(clockid_t _clkid, struct old_timespec32 *_ts) { From be969b7cfbcfa8a835a528f1dc467f0975c6d883 Mon Sep 17 00:00:00 2001 From: Sagar Shrikant Kadam Date: Tue, 10 Nov 2020 07:22:10 -0800 Subject: [PATCH 111/241] dts: phy: fix missing mdio device and probe failure of vsc8541-01 device HiFive unleashed A00 board has VSC8541-01 ethernet phy, this device is identified as a Revision B device as described in device identification registers. In order to use this phy in the unmanaged mode, it requires a specific reset sequence of logical 0-1-0-1 transition on the NRESET pin as documented here [1]. Currently, the bootloader (fsbl or u-boot-spl) takes care of the phy reset. If due to some reason the phy device hasn't received the reset by the prior stages before the linux macb driver comes into the picture, the MACB mii bus gets probed but the mdio scan fails and is not even able to read the phy ID registers. It gives an error message: "libphy: MACB_mii_bus: probed mdio_bus 10090000.ethernet-ffffffff: MDIO device at address 0 is missing." Thus adding the device OUI (Organizationally Unique Identifier) to the phy device node helps to probe the phy device. [1]: VSC8541-01 datasheet: https://www.mouser.com/ds/2/523/Microsemi_VSC8541-01_Datasheet_10496_V40-1148034.pdf Signed-off-by: Sagar Shrikant Kadam Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts b/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts index 4a2729f5ca3f..60846e88ae4b 100644 --- a/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts +++ b/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts @@ -88,6 +88,7 @@ phy-mode = "gmii"; phy-handle = <&phy0>; phy0: ethernet-phy@0 { + compatible = "ethernet-phy-id0007.0771"; reg = <0>; }; }; From a0fa9d727043da2238432471e85de0bdb8a8df65 Mon Sep 17 00:00:00 2001 From: Sagar Shrikant Kadam Date: Tue, 10 Nov 2020 07:22:11 -0800 Subject: [PATCH 112/241] dts: phy: add GPIO number and active state used for phy reset The GEMGXL_RST line on HiFive Unleashed is pulled low and is using GPIO number 12. Add these reset-gpio details to dt-node using which the linux phylib can reset the phy. Signed-off-by: Sagar Shrikant Kadam Signed-off-by: Palmer Dabbelt --- arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts b/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts index 60846e88ae4b..24d75a146e02 100644 --- a/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts +++ b/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts @@ -90,6 +90,7 @@ phy0: ethernet-phy@0 { compatible = "ethernet-phy-id0007.0771"; reg = <0>; + reset-gpios = <&gpio 12 GPIO_ACTIVE_LOW>; }; }; From 0983834a83931606a647c275e5d4165ce4e7b49f Mon Sep 17 00:00:00 2001 From: Sagar Shrikant Kadam Date: Tue, 10 Nov 2020 07:22:12 -0800 Subject: [PATCH 113/241] riscv: defconfig: enable gpio support for HiFive Unleashed Ethernet phy VSC8541-01 on HiFive Unleashed has its reset line connected to a gpio, so enable GPIO driver's required to reset the phy. Signed-off-by: Sagar Shrikant Kadam Signed-off-by: Palmer Dabbelt --- arch/riscv/configs/defconfig | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index d222d353d86d..8c3d1e451703 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -64,6 +64,8 @@ CONFIG_HW_RANDOM=y CONFIG_HW_RANDOM_VIRTIO=y CONFIG_SPI=y CONFIG_SPI_SIFIVE=y +CONFIG_GPIOLIB=y +CONFIG_GPIO_SIFIVE=y # CONFIG_PTP_1588_CLOCK is not set CONFIG_POWER_RESET=y CONFIG_DRM=y From 101c2fae5108d78915517d0279323ee215e70df2 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 5 Jan 2021 15:14:29 -0500 Subject: [PATCH 114/241] MAINTAINERS: update radeon/amdgpu/amdkfd git trees MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FDO is out of space, so move to gitlab. Reviewed-by: Christian König Signed-off-by: Alex Deucher --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index cc1e6a5ee6e6..82a47081f62a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -907,7 +907,7 @@ AMD KFD M: Felix Kuehling L: amd-gfx@lists.freedesktop.org S: Supported -T: git git://people.freedesktop.org/~agd5f/linux +T: git https://gitlab.freedesktop.org/agd5f/linux.git F: drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd*.[ch] F: drivers/gpu/drm/amd/amdkfd/ F: drivers/gpu/drm/amd/include/cik_structs.h @@ -14818,7 +14818,7 @@ M: Alex Deucher M: Christian König L: amd-gfx@lists.freedesktop.org S: Supported -T: git git://people.freedesktop.org/~agd5f/linux +T: git https://gitlab.freedesktop.org/agd5f/linux.git F: drivers/gpu/drm/amd/ F: drivers/gpu/drm/radeon/ F: include/uapi/drm/amdgpu_drm.h From ff9346dbabbb6595c5c20d90d88ae4a2247487a9 Mon Sep 17 00:00:00 2001 From: Alexandre Demers Date: Thu, 7 Jan 2021 18:53:03 -0500 Subject: [PATCH 115/241] drm/amdgpu: fix DRM_INFO flood if display core is not supported (bug 210921) This fix bug 210921 where DRM_INFO floods log when hitting an unsupported ASIC in amdgpu_device_asic_has_dc_support(). This info should be only called once. Bug: https://bugzilla.kernel.org/show_bug.cgi?id=210921 Signed-off-by: Alexandre Demers Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index b69c34074d8d..087afab67e22 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3034,7 +3034,7 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type) #endif default: if (amdgpu_dc > 0) - DRM_INFO("Display Core has been requested via kernel parameter " + DRM_INFO_ONCE("Display Core has been requested via kernel parameter " "but isn't supported by ASIC, ignoring\n"); return false; } From f14a5c34d143f6627f0be70c0de1d962f3a6ff1c Mon Sep 17 00:00:00 2001 From: Victor Zhao Date: Tue, 5 Jan 2021 15:04:01 +0800 Subject: [PATCH 116/241] drm/amdgpu/psp: fix psp gfx ctrl cmds psp GFX_CTRL_CMD_ID_CONSUME_CMD different for windows and linux, according to psp, linux cmds are not correct. v2: only correct GFX_CTRL_CMD_ID_CONSUME_CMD. Signed-off-by: Victor Zhao Reviewed-by: Emily.Deng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h b/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h index d65a5339d354..3ba7bdfde65d 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h +++ b/drivers/gpu/drm/amd/amdgpu/psp_gfx_if.h @@ -47,7 +47,7 @@ enum psp_gfx_crtl_cmd_id GFX_CTRL_CMD_ID_DISABLE_INT = 0x00060000, /* disable PSP-to-Gfx interrupt */ GFX_CTRL_CMD_ID_MODE1_RST = 0x00070000, /* trigger the Mode 1 reset */ GFX_CTRL_CMD_ID_GBR_IH_SET = 0x00080000, /* set Gbr IH_RB_CNTL registers */ - GFX_CTRL_CMD_ID_CONSUME_CMD = 0x000A0000, /* send interrupt to psp for updating write pointer of vf */ + GFX_CTRL_CMD_ID_CONSUME_CMD = 0x00090000, /* send interrupt to psp for updating write pointer of vf */ GFX_CTRL_CMD_ID_DESTROY_GPCOM_RING = 0x000C0000, /* destroy GPCOM ring */ GFX_CTRL_CMD_ID_MAX = 0x000F0000, /* max command ID */ From 73644143b31cb95866c19e0d94be9e3127ec3a6b Mon Sep 17 00:00:00 2001 From: Qingqing Zhuo Date: Wed, 9 Dec 2020 18:04:04 -0500 Subject: [PATCH 117/241] drm/amd/display: NULL pointer hang [Why] In dc_link_dp_set_test_pattern, we assume all pipes have a stream, which can cause null pointer dereference. [How] Add a null pointer check before accessing stream. Tested-by: Daniel Wheeler Signed-off-by: Qingqing Zhuo Reviewed-by: Nicholas Kazlauskas Acked-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c index 2fc12239b22c..1bd1a0935290 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc_link_dp.c @@ -3992,7 +3992,7 @@ bool dc_link_dp_set_test_pattern( unsigned int cust_pattern_size) { struct pipe_ctx *pipes = link->dc->current_state->res_ctx.pipe_ctx; - struct pipe_ctx *pipe_ctx = &pipes[0]; + struct pipe_ctx *pipe_ctx = NULL; unsigned int lane; unsigned int i; unsigned char link_qual_pattern[LANE_COUNT_DP_MAX] = {0}; @@ -4002,12 +4002,18 @@ bool dc_link_dp_set_test_pattern( memset(&training_pattern, 0, sizeof(training_pattern)); for (i = 0; i < MAX_PIPES; i++) { + if (pipes[i].stream == NULL) + continue; + if (pipes[i].stream->link == link && !pipes[i].top_pipe && !pipes[i].prev_odm_pipe) { pipe_ctx = &pipes[i]; break; } } + if (pipe_ctx == NULL) + return false; + /* Reset CRTC Test Pattern if it is currently running and request is VideoMode */ if (link->test_pattern_enabled && test_pattern == DP_TEST_PATTERN_VIDEO_MODE) { From 4336be4b07ed3b03a18ac35564c3127eeea05ab6 Mon Sep 17 00:00:00 2001 From: Wesley Chalmers Date: Mon, 14 Dec 2020 15:35:02 -0500 Subject: [PATCH 118/241] drm/amd/display: Initialize stack variable [WHY] The stack variable "val" is potentially unpopulate it, so initialize it with the value 0xf (indicating an invalid mux) Tested-by: Daniel Wheeler Signed-off-by: Wesley Chalmers Reviewed-by: Martin Leung Acked-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c index 100ce0e28fd5..b096011acb49 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_mpc.c @@ -470,7 +470,7 @@ void mpc1_cursor_lock(struct mpc *mpc, int opp_id, bool lock) unsigned int mpc1_get_mpc_out_mux(struct mpc *mpc, int opp_id) { struct dcn10_mpc *mpc10 = TO_DCN10_MPC(mpc); - uint32_t val = 0; + uint32_t val = 0xf; if (opp_id < MAX_OPP && REG(MUX[opp_id])) REG_GET(MUX[opp_id], MPC_OUT_MUX, &val); From 0eb31a82e378cab17beec1d213e1414e9fea1767 Mon Sep 17 00:00:00 2001 From: Nikola Cornij Date: Tue, 29 Dec 2020 23:33:32 -0500 Subject: [PATCH 119/241] drm/amd/display: Add a missing DCN3.01 API mapping [why] Required for DSC MST Tested-by: Daniel Wheeler Signed-off-by: Nikola Cornij Reviewed-by: Zhan Liu Acked-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c index 4825c5c1c6ed..35f5bf08ae96 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn301/dcn301_resource.c @@ -1731,6 +1731,7 @@ static struct resource_funcs dcn301_res_pool_funcs = { .populate_dml_pipes = dcn30_populate_dml_pipes_from_context, .acquire_idle_pipe_for_layer = dcn20_acquire_idle_pipe_for_layer, .add_stream_to_ctx = dcn30_add_stream_to_ctx, + .add_dsc_to_stream_resource = dcn20_add_dsc_to_stream_resource, .remove_stream_from_ctx = dcn20_remove_stream_from_ctx, .populate_dml_writeback_from_context = dcn30_populate_dml_writeback_from_context, .set_mcif_arb_params = dcn30_set_mcif_arb_params, From 9d03bb102028b4a3f4a64d6069b219e2e1c1f306 Mon Sep 17 00:00:00 2001 From: "Li, Roman" Date: Wed, 30 Dec 2020 18:03:02 +0000 Subject: [PATCH 120/241] drm/amd/display: disable dcn10 pipe split by default [Why] The initial purpose of dcn10 pipe split is to support some high bandwidth mode which requires dispclk greater than max dispclk. By initial bring up power measurement data, it showed power consumption is less with pipe split for dcn block. This could be reason for enable pipe split by default. By battery life measurement of some Chromebooks, result shows battery life is longer with pipe split disabled. [How] Disable pipe split by default. Pipe split could be still enabled when required dispclk is greater than max dispclk. Tested-by: Daniel Wheeler Signed-off-by: Hersen Wu Signed-off-by: Roman Li Reviewed-by: Roman Li Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c index 36745193c391..90e912fef2b3 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn10/dcn10_resource.c @@ -608,8 +608,8 @@ static const struct dc_debug_options debug_defaults_drv = { .disable_pplib_clock_request = false, .disable_pplib_wm_range = false, .pplib_wm_report_mode = WM_REPORT_DEFAULT, - .pipe_split_policy = MPC_SPLIT_DYNAMIC, - .force_single_disp_pipe_split = true, + .pipe_split_policy = MPC_SPLIT_AVOID, + .force_single_disp_pipe_split = false, .disable_dcc = DCC_ENABLE, .voltage_align_fclk = true, .disable_stereo_support = true, From 4eec66c014e9a406d8d453de958f6791d05427e4 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Mon, 11 Jan 2021 11:31:51 -0500 Subject: [PATCH 121/241] Revert "drm/amd/display: Fixed Intermittent blue screen on OLED panel" commit a861736dae64 ("drm/amd/display: Fixed Intermittent blue screen on OLED panel") causes power regression for many users. It seems that this change causes the MCLK to get forced high; this creates a regression for many users since their devices were not able to drop to a low state after this change. For this reason, this reverts commit a861736dae644a0d7abbca0c638ae6aad28feeb8. Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1407 Cc: Aurabindo Pillai Cc: Alex Deucher Cc: Harry Wentland Cc: Naveed Ashfaq Cc: Hersen Wu Cc: Roman Li Acked-by: Alex Deucher Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- .../amd/display/dc/dml/dcn20/display_mode_vba_20v2.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_mode_vba_20v2.c b/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_mode_vba_20v2.c index 860e72a51534..80170f9721ce 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_mode_vba_20v2.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn20/display_mode_vba_20v2.c @@ -2635,14 +2635,15 @@ static void dml20v2_DISPCLKDPPCLKDCFCLKDeepSleepPrefetchParametersWatermarksAndP } if (mode_lib->vba.DRAMClockChangeSupportsVActive && - mode_lib->vba.MinActiveDRAMClockChangeMargin > 60 && - mode_lib->vba.PrefetchMode[mode_lib->vba.VoltageLevel][mode_lib->vba.maxMpcComb] == 0) { + mode_lib->vba.MinActiveDRAMClockChangeMargin > 60) { mode_lib->vba.DRAMClockChangeWatermark += 25; for (k = 0; k < mode_lib->vba.NumberOfActivePlanes; ++k) { - if (mode_lib->vba.DRAMClockChangeWatermark > - dml_max(mode_lib->vba.StutterEnterPlusExitWatermark, mode_lib->vba.UrgentWatermark)) - mode_lib->vba.MinTTUVBlank[k] += 25; + if (mode_lib->vba.PrefetchMode[mode_lib->vba.VoltageLevel][mode_lib->vba.maxMpcComb] == 0) { + if (mode_lib->vba.DRAMClockChangeWatermark > + dml_max(mode_lib->vba.StutterEnterPlusExitWatermark, mode_lib->vba.UrgentWatermark)) + mode_lib->vba.MinTTUVBlank[k] += 25; + } } mode_lib->vba.DRAMClockChangeSupport[0][0] = dm_dram_clock_change_vactive; From 8b335bff643f3b39935c7377dbcd361c5b605d98 Mon Sep 17 00:00:00 2001 From: Jeremy Cline Date: Mon, 11 Jan 2021 16:05:28 -0500 Subject: [PATCH 122/241] drm/amdkfd: Fix out-of-bounds read in kdf_create_vcrat_image_cpu() KASAN reported a slab-out-of-bounds read of size 1 in kdf_create_vcrat_image_cpu(). This occurs when, for example, when on an x86_64 with a single NUMA node because kfd_fill_iolink_info_for_cpu() is a no-op, but afterwards the sub_type_hdr->length, which is out-of-bounds, is read and multiplied by entries. Fortunately, entries is 0 in this case so the overall crat_table->length is still correct. Check if there were any entries before de-referencing sub_type_hdr which may be pointing to out-of-bounds memory. Fixes: b7b6c38529c9 ("drm/amdkfd: Calculate CPU VCRAT size dynamically (v2)") Suggested-by: Felix Kuehling Signed-off-by: Jeremy Cline Reviewed-by: Felix Kuehling Signed-off-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_crat.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c index 8cac497c2c45..a5640a6138cf 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_crat.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_crat.c @@ -1040,11 +1040,14 @@ static int kfd_create_vcrat_image_cpu(void *pcrat_image, size_t *size) (struct crat_subtype_iolink *)sub_type_hdr); if (ret < 0) return ret; - crat_table->length += (sub_type_hdr->length * entries); - crat_table->total_entries += entries; - sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + - sub_type_hdr->length * entries); + if (entries) { + crat_table->length += (sub_type_hdr->length * entries); + crat_table->total_entries += entries; + + sub_type_hdr = (typeof(sub_type_hdr))((char *)sub_type_hdr + + sub_type_hdr->length * entries); + } #else pr_info("IO link not available for non x86 platforms\n"); #endif From 04eb6e773e9f3167a5921d74e8ad99cdcc4166c3 Mon Sep 17 00:00:00 2001 From: chen gong Date: Fri, 8 Jan 2021 12:46:44 +0800 Subject: [PATCH 123/241] drm/amdgpu/gfx10: add updated GOLDEN_TSC_COUNT_UPPER/LOWER register offsets for VGH The address of the GOLDEN_TSC_COUNT_UPPER/GOLDEN_TSC_COUNT_LOWER for Vnagogh are different from the others. The offset of the GOLDEN_TSC_COUNT_UPPER for Vangogh is 0x0025 by calculation. The offset of the GOLDEN_TSC_COUNT_LOWER for Vangogh is 0x0026 by calculation. Signed-off-by: chen gong Acked-by: Evan Quan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index ba1086784525..34141b785aa4 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -99,6 +99,10 @@ #define mmGCR_GENERAL_CNTL_Sienna_Cichlid 0x1580 #define mmGCR_GENERAL_CNTL_Sienna_Cichlid_BASE_IDX 0 +#define mmGOLDEN_TSC_COUNT_UPPER_Vangogh 0x0025 +#define mmGOLDEN_TSC_COUNT_UPPER_Vangogh_BASE_IDX 1 +#define mmGOLDEN_TSC_COUNT_LOWER_Vangogh 0x0026 +#define mmGOLDEN_TSC_COUNT_LOWER_Vangogh_BASE_IDX 1 #define mmSPI_CONFIG_CNTL_1_Vangogh 0x2441 #define mmSPI_CONFIG_CNTL_1_Vangogh_BASE_IDX 1 #define mmVGT_TF_MEMORY_BASE_HI_Vangogh 0x2261 @@ -7377,8 +7381,16 @@ static uint64_t gfx_v10_0_get_gpu_clock_counter(struct amdgpu_device *adev) amdgpu_gfx_off_ctrl(adev, false); mutex_lock(&adev->gfx.gpu_clock_mutex); - clock = (uint64_t)RREG32_SOC15(SMUIO, 0, mmGOLDEN_TSC_COUNT_LOWER) | - ((uint64_t)RREG32_SOC15(SMUIO, 0, mmGOLDEN_TSC_COUNT_UPPER) << 32ULL); + switch (adev->asic_type) { + case CHIP_VANGOGH: + clock = (uint64_t)RREG32_SOC15(SMUIO, 0, mmGOLDEN_TSC_COUNT_LOWER_Vangogh) | + ((uint64_t)RREG32_SOC15(SMUIO, 0, mmGOLDEN_TSC_COUNT_UPPER_Vangogh) << 32ULL); + break; + default: + clock = (uint64_t)RREG32_SOC15(SMUIO, 0, mmGOLDEN_TSC_COUNT_LOWER) | + ((uint64_t)RREG32_SOC15(SMUIO, 0, mmGOLDEN_TSC_COUNT_UPPER) << 32ULL); + break; + } mutex_unlock(&adev->gfx.gpu_clock_mutex); amdgpu_gfx_off_ctrl(adev, true); return clock; From 12f2df72205fe348481d941c3e593e8068d2d23d Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Wed, 13 Jan 2021 20:13:17 +0800 Subject: [PATCH 124/241] drm/amdgpu: fix vram type and bandwidth error for DDR5 and DDR4 This patch is to update atomfirmware parser for the memory type and bandwidth of DDR5 and DDR4. Signed-off-by: Huang Rui Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- .../gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c | 53 +++++++++++++------ 1 file changed, 36 insertions(+), 17 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c index 306077884a67..6107ac91db25 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_atomfirmware.c @@ -112,6 +112,7 @@ int amdgpu_atomfirmware_allocate_fb_scratch(struct amdgpu_device *adev) union igp_info { struct atom_integrated_system_info_v1_11 v11; struct atom_integrated_system_info_v1_12 v12; + struct atom_integrated_system_info_v2_1 v21; }; union umc_info { @@ -209,24 +210,42 @@ amdgpu_atomfirmware_get_vram_info(struct amdgpu_device *adev, if (adev->flags & AMD_IS_APU) { igp_info = (union igp_info *) (mode_info->atom_context->bios + data_offset); - switch (crev) { - case 11: - mem_channel_number = igp_info->v11.umachannelnumber; - /* channel width is 64 */ - if (vram_width) - *vram_width = mem_channel_number * 64; - mem_type = igp_info->v11.memorytype; - if (vram_type) - *vram_type = convert_atom_mem_type_to_vram_type(adev, mem_type); + switch (frev) { + case 1: + switch (crev) { + case 11: + case 12: + mem_channel_number = igp_info->v11.umachannelnumber; + if (!mem_channel_number) + mem_channel_number = 1; + /* channel width is 64 */ + if (vram_width) + *vram_width = mem_channel_number * 64; + mem_type = igp_info->v11.memorytype; + if (vram_type) + *vram_type = convert_atom_mem_type_to_vram_type(adev, mem_type); + break; + default: + return -EINVAL; + } break; - case 12: - mem_channel_number = igp_info->v12.umachannelnumber; - /* channel width is 64 */ - if (vram_width) - *vram_width = mem_channel_number * 64; - mem_type = igp_info->v12.memorytype; - if (vram_type) - *vram_type = convert_atom_mem_type_to_vram_type(adev, mem_type); + case 2: + switch (crev) { + case 1: + case 2: + mem_channel_number = igp_info->v21.umachannelnumber; + if (!mem_channel_number) + mem_channel_number = 1; + /* channel width is 64 */ + if (vram_width) + *vram_width = mem_channel_number * 64; + mem_type = igp_info->v21.memorytype; + if (vram_type) + *vram_type = convert_atom_mem_type_to_vram_type(adev, mem_type); + break; + default: + return -EINVAL; + } break; default: return -EINVAL; From 21702c8cae51535e09b91341a069503c6ef3d2a3 Mon Sep 17 00:00:00 2001 From: Prike Liang Date: Fri, 2 Oct 2020 10:58:55 -0400 Subject: [PATCH 125/241] drm/amdgpu: add green_sardine device id (v2) Add green_sardine PCI id support and map it to renoir asic type. v2: add apu flag Signed-off-by: Prike Liang Reviewed-by: Alex Deucher Reviewed-by: Huang Rui Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 5.10.x --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 72efd579ec5e..e191383f34f7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1085,6 +1085,7 @@ static const struct pci_device_id pciidlist[] = { /* Renoir */ {0x1002, 0x1636, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU}, + {0x1002, 0x1638, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU}, /* Navi12 */ {0x1002, 0x7360, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI12}, From 53f1e7f6a1720f8299b5283857eedc8f07d29533 Mon Sep 17 00:00:00 2001 From: mengwang Date: Wed, 12 Aug 2020 11:49:29 +0800 Subject: [PATCH 126/241] drm/amdgpu: add new device id for Renior add DID 0x164C into pciidlist under CHIP_RENOIR family. Signed-off-by: mengwang Reviewed-by: Huang Rui Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 5.10.x --- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 + drivers/gpu/drm/amd/amdgpu/soc15.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index e191383f34f7..7169fb5e3d9c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -1086,6 +1086,7 @@ static const struct pci_device_id pciidlist[] = { /* Renoir */ {0x1002, 0x1636, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU}, {0x1002, 0x1638, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU}, + {0x1002, 0x164C, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_RENOIR|AMD_IS_APU}, /* Navi12 */ {0x1002, 0x7360, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_NAVI12}, diff --git a/drivers/gpu/drm/amd/amdgpu/soc15.c b/drivers/gpu/drm/amd/amdgpu/soc15.c index 8a23636ecc27..0b3516c4eefb 100644 --- a/drivers/gpu/drm/amd/amdgpu/soc15.c +++ b/drivers/gpu/drm/amd/amdgpu/soc15.c @@ -1239,7 +1239,8 @@ static int soc15_common_early_init(void *handle) break; case CHIP_RENOIR: adev->asic_funcs = &soc15_asic_funcs; - if (adev->pdev->device == 0x1636) + if ((adev->pdev->device == 0x1636) || + (adev->pdev->device == 0x164c)) adev->apu_flags |= AMD_APU_IS_RENOIR; else adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE; From 3c516e038f0cc3915825bdac619d448c2b1811f2 Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Thu, 14 Jan 2021 15:19:23 +0800 Subject: [PATCH 127/241] Documentation: ACPI: EINJ: Fix error type values for PCIe errors Fix the error type value for PCI Express uncorrectable non-fatal error to 0x00000080 and fix the error type value for PCI Express uncorrectable fatal error to 0x00000100. See Advanced Configuration and Power Interface Specification, version 6.2, table "18-409 Error Type Definition". Signed-off-by: Qiuxu Zhuo Reported-by: Lijian Zhao [ rjw: Subject edits ] Signed-off-by: Rafael J. Wysocki --- Documentation/firmware-guide/acpi/apei/einj.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/firmware-guide/acpi/apei/einj.rst b/Documentation/firmware-guide/acpi/apei/einj.rst index e588bccf5158..c042176e1707 100644 --- a/Documentation/firmware-guide/acpi/apei/einj.rst +++ b/Documentation/firmware-guide/acpi/apei/einj.rst @@ -50,8 +50,8 @@ The following files belong to it: 0x00000010 Memory Uncorrectable non-fatal 0x00000020 Memory Uncorrectable fatal 0x00000040 PCI Express Correctable - 0x00000080 PCI Express Uncorrectable fatal - 0x00000100 PCI Express Uncorrectable non-fatal + 0x00000080 PCI Express Uncorrectable non-fatal + 0x00000100 PCI Express Uncorrectable fatal 0x00000200 Platform Correctable 0x00000400 Platform Uncorrectable non-fatal 0x00000800 Platform Uncorrectable fatal From 7a2da5d7960a64ee923fe3e31f01a1101052c66f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 14 Jan 2021 13:09:37 +0000 Subject: [PATCH 128/241] spi: fsl: Fix driver breakage when SPI_CS_HIGH is not set in spi->mode Commit 766c6b63aa04 ("spi: fix client driver breakages when using GPIO descriptors") broke fsl spi driver. As now we fully rely on gpiolib for handling the polarity of chip selects, the driver shall not alter the GPIO value anymore when SPI_CS_HIGH is not set in spi->mode. Fixes: 766c6b63aa04 ("spi: fix client driver breakages when using GPIO descriptors") Cc: stable@vger.kernel.org Signed-off-by: Christophe Leroy Link: https://lore.kernel.org/r/6b51cc2bfbca70d3e9b9da7b7aa4c7a9d793ca0e.1610629002.git.christophe.leroy@csgroup.eu Signed-off-by: Mark Brown --- drivers/spi/spi-fsl-spi.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/spi/spi-fsl-spi.c b/drivers/spi/spi-fsl-spi.c index 9494257e1c33..6d8e0a05a535 100644 --- a/drivers/spi/spi-fsl-spi.c +++ b/drivers/spi/spi-fsl-spi.c @@ -115,14 +115,13 @@ static void fsl_spi_chipselect(struct spi_device *spi, int value) { struct mpc8xxx_spi *mpc8xxx_spi = spi_master_get_devdata(spi->master); struct fsl_spi_platform_data *pdata; - bool pol = spi->mode & SPI_CS_HIGH; struct spi_mpc8xxx_cs *cs = spi->controller_state; pdata = spi->dev.parent->parent->platform_data; if (value == BITBANG_CS_INACTIVE) { if (pdata->cs_control) - pdata->cs_control(spi, !pol); + pdata->cs_control(spi, false); } if (value == BITBANG_CS_ACTIVE) { @@ -134,7 +133,7 @@ static void fsl_spi_chipselect(struct spi_device *spi, int value) fsl_spi_change_mode(spi); if (pdata->cs_control) - pdata->cs_control(spi, pol); + pdata->cs_control(spi, true); } } From c87a95dc28b1431c7e77e2c0c983cf37698089d2 Mon Sep 17 00:00:00 2001 From: Ignat Korchagin Date: Wed, 13 Jan 2021 19:17:17 +0000 Subject: [PATCH 129/241] dm crypt: defer decryption to a tasklet if interrupts disabled On some specific hardware on early boot we occasionally get: [ 1193.920255][ T0] BUG: sleeping function called from invalid context at mm/mempool.c:381 [ 1193.936616][ T0] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 0, name: swapper/69 [ 1193.953233][ T0] no locks held by swapper/69/0. [ 1193.965871][ T0] irq event stamp: 575062 [ 1193.977724][ T0] hardirqs last enabled at (575061): [] tick_nohz_idle_exit+0xe2/0x3e0 [ 1194.002762][ T0] hardirqs last disabled at (575062): [] flush_smp_call_function_from_idle+0x4f/0x80 [ 1194.029035][ T0] softirqs last enabled at (575050): [] asm_call_irq_on_stack+0x12/0x20 [ 1194.054227][ T0] softirqs last disabled at (575043): [] asm_call_irq_on_stack+0x12/0x20 [ 1194.079389][ T0] CPU: 69 PID: 0 Comm: swapper/69 Not tainted 5.10.6-cloudflare-kasan-2021.1.4-dev #1 [ 1194.104103][ T0] Hardware name: NULL R162-Z12-CD/MZ12-HD4-CD, BIOS R10 06/04/2020 [ 1194.119591][ T0] Call Trace: [ 1194.130233][ T0] dump_stack+0x9a/0xcc [ 1194.141617][ T0] ___might_sleep.cold+0x180/0x1b0 [ 1194.153825][ T0] mempool_alloc+0x16b/0x300 [ 1194.165313][ T0] ? remove_element+0x160/0x160 [ 1194.176961][ T0] ? blk_mq_end_request+0x4b/0x490 [ 1194.188778][ T0] crypt_convert+0x27f6/0x45f0 [dm_crypt] [ 1194.201024][ T0] ? rcu_read_lock_sched_held+0x3f/0x70 [ 1194.212906][ T0] ? module_assert_mutex_or_preempt+0x3e/0x70 [ 1194.225318][ T0] ? __module_address.part.0+0x1b/0x3a0 [ 1194.237212][ T0] ? is_kernel_percpu_address+0x5b/0x190 [ 1194.249238][ T0] ? crypt_iv_tcw_ctr+0x4a0/0x4a0 [dm_crypt] [ 1194.261593][ T0] ? is_module_address+0x25/0x40 [ 1194.272905][ T0] ? static_obj+0x8a/0xc0 [ 1194.283582][ T0] ? lockdep_init_map_waits+0x26a/0x700 [ 1194.295570][ T0] ? __raw_spin_lock_init+0x39/0x110 [ 1194.307330][ T0] kcryptd_crypt_read_convert+0x31c/0x560 [dm_crypt] [ 1194.320496][ T0] ? kcryptd_queue_crypt+0x1be/0x380 [dm_crypt] [ 1194.333203][ T0] blk_update_request+0x6d7/0x1500 [ 1194.344841][ T0] ? blk_mq_trigger_softirq+0x190/0x190 [ 1194.356831][ T0] blk_mq_end_request+0x4b/0x490 [ 1194.367994][ T0] ? blk_mq_trigger_softirq+0x190/0x190 [ 1194.379693][ T0] flush_smp_call_function_queue+0x24b/0x560 [ 1194.391847][ T0] flush_smp_call_function_from_idle+0x59/0x80 [ 1194.403969][ T0] do_idle+0x287/0x450 [ 1194.413891][ T0] ? arch_cpu_idle_exit+0x40/0x40 [ 1194.424716][ T0] ? lockdep_hardirqs_on_prepare+0x286/0x3f0 [ 1194.436399][ T0] ? _raw_spin_unlock_irqrestore+0x39/0x40 [ 1194.447759][ T0] cpu_startup_entry+0x19/0x20 [ 1194.458038][ T0] secondary_startup_64_no_verify+0xb0/0xbb IO completion can be queued to a different CPU by the block subsystem as a "call single function/data". The CPU may run these routines from the idle task, but it does so with interrupts disabled. It is not a good idea to do decryption with irqs disabled even in an idle task context, so just defer it to a tasklet (as is done with requests from hard irqs). Fixes: 39d42fa96ba1 ("dm crypt: add flags to optionally bypass kcryptd workqueues") Cc: stable@vger.kernel.org # v5.9+ Signed-off-by: Ignat Korchagin Signed-off-by: Mike Snitzer --- drivers/md/dm-crypt.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index b4d7418b5036..8c874710f0bc 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -2221,8 +2221,12 @@ static void kcryptd_queue_crypt(struct dm_crypt_io *io) if ((bio_data_dir(io->base_bio) == READ && test_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags)) || (bio_data_dir(io->base_bio) == WRITE && test_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags))) { - if (in_irq()) { - /* Crypto API's "skcipher_walk_first() refuses to work in hard IRQ context */ + /* + * in_irq(): Crypto API's skcipher_walk_first() refuses to work in hard IRQ context. + * irqs_disabled(): the kernel may run some IO completion from the idle thread, but + * it is being executed with irqs disabled. + */ + if (in_irq() || irqs_disabled()) { tasklet_init(&io->tasklet, kcryptd_crypt_tasklet, (unsigned long)&io->work); tasklet_schedule(&io->tasklet); return; From 55ed4560774d81d7343223b8fd2784c530a9c6c1 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 9 Dec 2020 14:27:44 +0900 Subject: [PATCH 130/241] tools/bootconfig: Add tracing_on support to helper scripts Add ftrace.instance.INSTANCE.tracing_on support to ftrace2bconf.sh and bconf2ftrace.sh. commit 8490db06f914 ("tracing/boot: Add per-instance tracing_on option support") added the per-instance tracing_on option, but forgot to update the helper scripts. Link: https://lkml.kernel.org/r/160749166410.3497930.14204335886811029800.stgit@devnote2 Cc: stable@vger.kernel.org Fixes: 8490db06f914 ("tracing/boot: Add per-instance tracing_on option support") Signed-off-by: Masami Hiramatsu Signed-off-by: Steven Rostedt (VMware) --- tools/bootconfig/scripts/bconf2ftrace.sh | 1 + tools/bootconfig/scripts/ftrace2bconf.sh | 4 ++++ 2 files changed, 5 insertions(+) diff --git a/tools/bootconfig/scripts/bconf2ftrace.sh b/tools/bootconfig/scripts/bconf2ftrace.sh index 595e164dc352..feb30c2c7881 100755 --- a/tools/bootconfig/scripts/bconf2ftrace.sh +++ b/tools/bootconfig/scripts/bconf2ftrace.sh @@ -152,6 +152,7 @@ setup_instance() { # [instance] set_array_of ${instance}.options ${instancedir}/trace_options set_value_of ${instance}.trace_clock ${instancedir}/trace_clock set_value_of ${instance}.cpumask ${instancedir}/tracing_cpumask + set_value_of ${instance}.tracing_on ${instancedir}/tracing_on set_value_of ${instance}.tracer ${instancedir}/current_tracer set_array_of ${instance}.ftrace.filters \ ${instancedir}/set_ftrace_filter diff --git a/tools/bootconfig/scripts/ftrace2bconf.sh b/tools/bootconfig/scripts/ftrace2bconf.sh index 6c0d4b61e0c2..a0c3bcc6da4f 100755 --- a/tools/bootconfig/scripts/ftrace2bconf.sh +++ b/tools/bootconfig/scripts/ftrace2bconf.sh @@ -221,6 +221,10 @@ instance_options() { # [instance-name] if [ `echo $val | sed -e s/f//g`x != x ]; then emit_kv $PREFIX.cpumask = $val fi + val=`cat $INSTANCE/tracing_on` + if [ `echo $val | sed -e s/f//g`x != x ]; then + emit_kv $PREFIX.tracing_on = $val + fi val= for i in `cat $INSTANCE/set_event`; do From b79f2dc5ffe17b03ec8c55f0d63f65e87bcac676 Mon Sep 17 00:00:00 2001 From: Aharon Landau Date: Wed, 13 Jan 2021 14:16:59 +0200 Subject: [PATCH 131/241] RDMA/umem: Avoid undefined behavior of rounddown_pow_of_two() rounddown_pow_of_two() is undefined when the input is 0. Therefore we need to avoid it in ib_umem_find_best_pgsz and return 0. Otherwise, it could result in not rejecting an invalid page size which eventually causes a kernel oops due to the logical inconsistency. Fixes: 3361c29e9279 ("RDMA/umem: Use simpler logic for ib_umem_find_best_pgsz()") Link: https://lore.kernel.org/r/20210113121703.559778-2-leon@kernel.org Signed-off-by: Aharon Landau Reviewed-by: Jason Gunthorpe Reviewed-by: Maor Gottlieb Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/umem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 7ca4112e3e8f..917338db7ac1 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -135,7 +135,7 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, */ if (mask) pgsz_bitmap &= GENMASK(count_trailing_zeros(mask), 0); - return rounddown_pow_of_two(pgsz_bitmap); + return pgsz_bitmap ? rounddown_pow_of_two(pgsz_bitmap) : 0; } EXPORT_SYMBOL(ib_umem_find_best_pgsz); From 2cb091f6293df898b47f4e0f2e54324e2bbaf816 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 13 Jan 2021 14:17:00 +0200 Subject: [PATCH 132/241] IB/mlx5: Fix error unwinding when set_has_smi_cap fails When set_has_smi_cap() fails, multiport master cleanup is missed. Fix it by doing the correct error unwinding goto. Fixes: a989ea01cb10 ("RDMA/mlx5: Move SMI caps logic") Link: https://lore.kernel.org/r/20210113121703.559778-3-leon@kernel.org Signed-off-by: Parav Pandit Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index 3bae9ba0ead8..fbe3b75f866b 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -3956,7 +3956,7 @@ static int mlx5_ib_stage_init_init(struct mlx5_ib_dev *dev) err = set_has_smi_cap(dev); if (err) - return err; + goto err_mp; if (!mlx5_core_mp_enabled(mdev)) { for (i = 1; i <= dev->num_ports; i++) { From 1c3aa6bd0b823105c2030af85d92d158e815d669 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Wed, 13 Jan 2021 14:17:03 +0200 Subject: [PATCH 133/241] RDMA/mlx5: Fix wrong free of blue flame register on error If the allocation of the fast path blue flame register fails, the driver should free the regular blue flame register allocated a statement above, not the one that it just failed to allocate. Fixes: 16c1975f1032 ("IB/mlx5: Create profile infrastructure to add and remove stages") Link: https://lore.kernel.org/r/20210113121703.559778-6-leon@kernel.org Reported-by: Hans Petter Selasky Signed-off-by: Mark Bloch Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/hw/mlx5/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index fbe3b75f866b..d26f3f3e0462 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -4319,7 +4319,7 @@ static int mlx5_ib_stage_bfrag_init(struct mlx5_ib_dev *dev) err = mlx5_alloc_bfreg(dev->mdev, &dev->fp_bfreg, false, true); if (err) - mlx5_free_bfreg(dev->mdev, &dev->fp_bfreg); + mlx5_free_bfreg(dev->mdev, &dev->bfreg); return err; } From 7c7b3e5d9aeed31d35c5dab0bf9c0fd4c8923206 Mon Sep 17 00:00:00 2001 From: Neta Ostrovsky Date: Wed, 13 Jan 2021 15:02:14 +0200 Subject: [PATCH 134/241] RDMA/cma: Fix error flow in default_roce_mode_store In default_roce_mode_store(), we took a reference to cma_dev, but didn't return it with cma_dev_put in the error flow. Fixes: 1c15b4f2a42f ("RDMA/core: Modify enum ib_gid_type and enum rdma_network_type") Link: https://lore.kernel.org/r/20210113130214.562108-1-leon@kernel.org Signed-off-by: Neta Ostrovsky Signed-off-by: Leon Romanovsky Signed-off-by: Jason Gunthorpe --- drivers/infiniband/core/cma_configfs.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c index 7f70e5a7de10..97a77ea8d3c9 100644 --- a/drivers/infiniband/core/cma_configfs.c +++ b/drivers/infiniband/core/cma_configfs.c @@ -131,8 +131,10 @@ static ssize_t default_roce_mode_store(struct config_item *item, return ret; gid_type = ib_cache_gid_parse_type_str(buf); - if (gid_type < 0) + if (gid_type < 0) { + cma_configfs_params_put(cma_dev); return -EINVAL; + } ret = cma_set_default_gid_type(cma_dev, group->port_num, gid_type); From 4369376ba91c97a1b2dd74abeec18c0c0ddf4ac9 Mon Sep 17 00:00:00 2001 From: Likun Gao Date: Fri, 8 Jan 2021 16:34:31 +0800 Subject: [PATCH 135/241] drm/amdgpu: set power brake sequence Add function to set power brake sequence. Signed-off-by: Likun Gao Reviewed-by: Kenneth Feng Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 32 ++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 34141b785aa4..619d34c041ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -164,6 +164,9 @@ #define mmGCVM_L2_CGTT_CLK_CTRL_Sienna_Cichlid 0x15db #define mmGCVM_L2_CGTT_CLK_CTRL_Sienna_Cichlid_BASE_IDX 0 +#define mmGC_THROTTLE_CTRL_Sienna_Cichlid 0x2030 +#define mmGC_THROTTLE_CTRL_Sienna_Cichlid_BASE_IDX 0 + MODULE_FIRMWARE("amdgpu/navi10_ce.bin"); MODULE_FIRMWARE("amdgpu/navi10_pfp.bin"); MODULE_FIRMWARE("amdgpu/navi10_me.bin"); @@ -3328,6 +3331,7 @@ static void gfx_v10_0_ring_emit_de_meta(struct amdgpu_ring *ring, bool resume); static void gfx_v10_0_ring_emit_frame_cntl(struct amdgpu_ring *ring, bool start, bool secure); static u32 gfx_v10_3_get_disabled_sa(struct amdgpu_device *adev); static void gfx_v10_3_program_pbb_mode(struct amdgpu_device *adev); +static void gfx_v10_3_set_power_brake_sequence(struct amdgpu_device *adev); static void gfx10_kiq_set_resources(struct amdgpu_ring *kiq_ring, uint64_t queue_mask) { @@ -7196,6 +7200,9 @@ static int gfx_v10_0_hw_init(void *handle) if (adev->asic_type == CHIP_SIENNA_CICHLID) gfx_v10_3_program_pbb_mode(adev); + if (adev->asic_type >= CHIP_SIENNA_CICHLID) + gfx_v10_3_set_power_brake_sequence(adev); + return r; } @@ -9181,6 +9188,31 @@ static void gfx_v10_3_program_pbb_mode(struct amdgpu_device *adev) } } +static void gfx_v10_3_set_power_brake_sequence(struct amdgpu_device *adev) +{ + WREG32_SOC15(GC, 0, mmGRBM_GFX_INDEX, + (0x1 << GRBM_GFX_INDEX__SA_BROADCAST_WRITES__SHIFT) | + (0x1 << GRBM_GFX_INDEX__INSTANCE_BROADCAST_WRITES__SHIFT) | + (0x1 << GRBM_GFX_INDEX__SE_BROADCAST_WRITES__SHIFT)); + + WREG32_SOC15(GC, 0, mmGC_CAC_IND_INDEX, ixPWRBRK_STALL_PATTERN_CTRL); + WREG32_SOC15(GC, 0, mmGC_CAC_IND_DATA, + (0x1 << PWRBRK_STALL_PATTERN_CTRL__PWRBRK_STEP_INTERVAL__SHIFT) | + (0x12 << PWRBRK_STALL_PATTERN_CTRL__PWRBRK_BEGIN_STEP__SHIFT) | + (0x13 << PWRBRK_STALL_PATTERN_CTRL__PWRBRK_END_STEP__SHIFT) | + (0xf << PWRBRK_STALL_PATTERN_CTRL__PWRBRK_THROTTLE_PATTERN_BIT_NUMS__SHIFT)); + + WREG32_SOC15(GC, 0, mmGC_THROTTLE_CTRL_Sienna_Cichlid, + (0x1 << GC_THROTTLE_CTRL__PWRBRK_STALL_EN__SHIFT) | + (0x1 << GC_THROTTLE_CTRL__PATTERN_MODE__SHIFT) | + (0x5 << GC_THROTTLE_CTRL__RELEASE_STEP_INTERVAL__SHIFT)); + + WREG32_SOC15(GC, 0, mmDIDT_IND_INDEX, ixDIDT_SQ_THROTTLE_CTRL); + + WREG32_SOC15(GC, 0, mmDIDT_IND_DATA, + (0x1 << DIDT_SQ_THROTTLE_CTRL__PWRBRK_STALL_EN__SHIFT)); +} + const struct amdgpu_ip_block_version gfx_v10_0_ip_block = { .type = AMD_IP_BLOCK_TYPE_GFX, From 55df908bd663ead7d85bd64dd49562d5ac3889ef Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Thu, 7 Jan 2021 14:50:51 -0500 Subject: [PATCH 136/241] Revert "drm/amd/display: Fix unused variable warning" This reverts commit f01afd1ee48816457fb22e201f1d0cfb14589904. Cc: Wayne Lin Cc: Alexander Deucher Cc: Harry Wentland Cc: Roman Li Cc: Bindu R Cc: Daniel Vetter Acked-by: Daniel Vetter Acked-by: Alex Deucher Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 4 +++- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.h | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 146486071d01..8fac6116591b 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -8378,7 +8378,8 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) acrtc->dm_irq_params.stream = dm_new_crtc_state->stream; manage_dm_interrupts(adev, acrtc, true); } - if (IS_ENABLED(CONFIG_DEBUG_FS) && new_crtc_state->active && +#ifdef CONFIG_DEBUG_FS + if (new_crtc_state->active && amdgpu_dm_is_valid_crc_source(dm_new_crtc_state->crc_src)) { /** * Frontend may have changed so reapply the CRC capture @@ -8399,6 +8400,7 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) amdgpu_dm_crtc_configure_crc_source( crtc, dm_new_crtc_state, dm_new_crtc_state->crc_src); } +#endif } for_each_new_crtc_in_state(state, crtc, new_crtc_state, j) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.h index eba2f1d35d07..0235bfb246e5 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.h @@ -46,13 +46,13 @@ static inline bool amdgpu_dm_is_valid_crc_source(enum amdgpu_dm_pipe_crc_source } /* amdgpu_dm_crc.c */ +#ifdef CONFIG_DEBUG_FS bool amdgpu_dm_crc_window_is_default(struct dm_crtc_state *dm_crtc_state); bool amdgpu_dm_crc_window_changed(struct dm_crtc_state *dm_new_crtc_state, struct dm_crtc_state *dm_old_crtc_state); int amdgpu_dm_crtc_configure_crc_source(struct drm_crtc *crtc, struct dm_crtc_state *dm_crtc_state, enum amdgpu_dm_pipe_crc_source source); -#ifdef CONFIG_DEBUG_FS int amdgpu_dm_crtc_set_crc_source(struct drm_crtc *crtc, const char *src_name); int amdgpu_dm_crtc_verify_crc_source(struct drm_crtc *crtc, const char *src_name, From 3c517ca5212faab4604e1725b4d31e290945ff87 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Thu, 7 Jan 2021 14:51:49 -0500 Subject: [PATCH 137/241] Revert "drm/amdgpu/disply: fix documentation warnings in display manager" This reverts commit 6ae09fa49147e557eb6aebbb5b2059b63706d454. Cc: Wayne Lin Cc: Alexander Deucher Cc: Harry Wentland Cc: Roman Li Cc: Bindu R Cc: Daniel Vetter Acked-by: Daniel Vetter Acked-by: Alex Deucher Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 21 +------------------ 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h index 2ee6edb3df93..0b31779a0485 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h @@ -337,29 +337,10 @@ struct amdgpu_display_manager { const struct gpu_info_soc_bounding_box_v1_0 *soc_bounding_box; #ifdef CONFIG_DEBUG_FS - /** - * @crc_win_x_start_property: - * - * X start of the crc calculation window - */ + /* set the crc calculation window*/ struct drm_property *crc_win_x_start_property; - /** - * @crc_win_y_start_property: - * - * Y start of the crc calculation window - */ struct drm_property *crc_win_y_start_property; - /** - * @crc_win_x_end_property: - * - * X end of the crc calculation window - */ struct drm_property *crc_win_x_end_property; - /** - * @crc_win_y_end_property: - * - * Y end of the crc calculation window - */ struct drm_property *crc_win_y_end_property; #endif /** From a7ddd22151fc2910c7b2faad64680cc2bb699b03 Mon Sep 17 00:00:00 2001 From: Rodrigo Siqueira Date: Thu, 7 Jan 2021 15:09:30 -0500 Subject: [PATCH 138/241] Revert "drm/amd/display: Expose new CRC window property" This reverts commit c920888c604d72799d057bbcd9e28a6c003ccfbe. Cc: Wayne Lin Cc: Alexander Deucher Cc: Harry Wentland Cc: Roman Li Cc: Bindu R Cc: Daniel Vetter Acked-by: Daniel Vetter Acked-by: Alex Deucher Reviewed-by: Wayne Lin Signed-off-by: Rodrigo Siqueira Signed-off-by: Alex Deucher --- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 142 +----------------- .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 19 --- .../drm/amd/display/amdgpu_dm/amdgpu_dm_crc.c | 56 +------ .../drm/amd/display/amdgpu_dm/amdgpu_dm_crc.h | 3 - 4 files changed, 10 insertions(+), 210 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 8fac6116591b..c6da89df055d 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -939,41 +939,6 @@ static void mmhub_read_system_context(struct amdgpu_device *adev, struct dc_phy_ } #endif -#ifdef CONFIG_DEBUG_FS -static int create_crtc_crc_properties(struct amdgpu_display_manager *dm) -{ - dm->crc_win_x_start_property = - drm_property_create_range(adev_to_drm(dm->adev), - DRM_MODE_PROP_ATOMIC, - "AMD_CRC_WIN_X_START", 0, U16_MAX); - if (!dm->crc_win_x_start_property) - return -ENOMEM; - - dm->crc_win_y_start_property = - drm_property_create_range(adev_to_drm(dm->adev), - DRM_MODE_PROP_ATOMIC, - "AMD_CRC_WIN_Y_START", 0, U16_MAX); - if (!dm->crc_win_y_start_property) - return -ENOMEM; - - dm->crc_win_x_end_property = - drm_property_create_range(adev_to_drm(dm->adev), - DRM_MODE_PROP_ATOMIC, - "AMD_CRC_WIN_X_END", 0, U16_MAX); - if (!dm->crc_win_x_end_property) - return -ENOMEM; - - dm->crc_win_y_end_property = - drm_property_create_range(adev_to_drm(dm->adev), - DRM_MODE_PROP_ATOMIC, - "AMD_CRC_WIN_Y_END", 0, U16_MAX); - if (!dm->crc_win_y_end_property) - return -ENOMEM; - - return 0; -} -#endif - static int amdgpu_dm_init(struct amdgpu_device *adev) { struct dc_init_data init_data; @@ -1120,10 +1085,6 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) dc_init_callbacks(adev->dm.dc, &init_params); } -#endif -#ifdef CONFIG_DEBUG_FS - if (create_crtc_crc_properties(&adev->dm)) - DRM_ERROR("amdgpu: failed to create crc property.\n"); #endif if (amdgpu_dm_initialize_drm_device(adev)) { DRM_ERROR( @@ -5333,64 +5294,12 @@ dm_crtc_duplicate_state(struct drm_crtc *crtc) state->crc_src = cur->crc_src; state->cm_has_degamma = cur->cm_has_degamma; state->cm_is_degamma_srgb = cur->cm_is_degamma_srgb; -#ifdef CONFIG_DEBUG_FS - state->crc_window = cur->crc_window; -#endif + /* TODO Duplicate dc_stream after objects are stream object is flattened */ return &state->base; } -#ifdef CONFIG_DEBUG_FS -static int amdgpu_dm_crtc_atomic_set_property(struct drm_crtc *crtc, - struct drm_crtc_state *crtc_state, - struct drm_property *property, - uint64_t val) -{ - struct drm_device *dev = crtc->dev; - struct amdgpu_device *adev = drm_to_adev(dev); - struct dm_crtc_state *dm_new_state = - to_dm_crtc_state(crtc_state); - - if (property == adev->dm.crc_win_x_start_property) - dm_new_state->crc_window.x_start = val; - else if (property == adev->dm.crc_win_y_start_property) - dm_new_state->crc_window.y_start = val; - else if (property == adev->dm.crc_win_x_end_property) - dm_new_state->crc_window.x_end = val; - else if (property == adev->dm.crc_win_y_end_property) - dm_new_state->crc_window.y_end = val; - else - return -EINVAL; - - return 0; -} - -static int amdgpu_dm_crtc_atomic_get_property(struct drm_crtc *crtc, - const struct drm_crtc_state *state, - struct drm_property *property, - uint64_t *val) -{ - struct drm_device *dev = crtc->dev; - struct amdgpu_device *adev = drm_to_adev(dev); - struct dm_crtc_state *dm_state = - to_dm_crtc_state(state); - - if (property == adev->dm.crc_win_x_start_property) - *val = dm_state->crc_window.x_start; - else if (property == adev->dm.crc_win_y_start_property) - *val = dm_state->crc_window.y_start; - else if (property == adev->dm.crc_win_x_end_property) - *val = dm_state->crc_window.x_end; - else if (property == adev->dm.crc_win_y_end_property) - *val = dm_state->crc_window.y_end; - else - return -EINVAL; - - return 0; -} -#endif - static inline int dm_set_vupdate_irq(struct drm_crtc *crtc, bool enable) { enum dc_irq_source irq_source; @@ -5457,10 +5366,6 @@ static const struct drm_crtc_funcs amdgpu_dm_crtc_funcs = { .enable_vblank = dm_enable_vblank, .disable_vblank = dm_disable_vblank, .get_vblank_timestamp = drm_crtc_vblank_helper_get_vblank_timestamp, -#ifdef CONFIG_DEBUG_FS - .atomic_set_property = amdgpu_dm_crtc_atomic_set_property, - .atomic_get_property = amdgpu_dm_crtc_atomic_get_property, -#endif }; static enum drm_connector_status @@ -6662,25 +6567,6 @@ static int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm, return 0; } -#ifdef CONFIG_DEBUG_FS -static void attach_crtc_crc_properties(struct amdgpu_display_manager *dm, - struct amdgpu_crtc *acrtc) -{ - drm_object_attach_property(&acrtc->base.base, - dm->crc_win_x_start_property, - 0); - drm_object_attach_property(&acrtc->base.base, - dm->crc_win_y_start_property, - 0); - drm_object_attach_property(&acrtc->base.base, - dm->crc_win_x_end_property, - 0); - drm_object_attach_property(&acrtc->base.base, - dm->crc_win_y_end_property, - 0); -} -#endif - static int amdgpu_dm_crtc_init(struct amdgpu_display_manager *dm, struct drm_plane *plane, uint32_t crtc_index) @@ -6728,9 +6614,7 @@ static int amdgpu_dm_crtc_init(struct amdgpu_display_manager *dm, drm_crtc_enable_color_mgmt(&acrtc->base, MAX_COLOR_LUT_ENTRIES, true, MAX_COLOR_LUT_ENTRIES); drm_mode_crtc_set_gamma_size(&acrtc->base, MAX_COLOR_LEGACY_LUT_ENTRIES); -#ifdef CONFIG_DEBUG_FS - attach_crtc_crc_properties(dm, acrtc); -#endif + return 0; fail: @@ -8367,7 +8251,6 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) */ for_each_oldnew_crtc_in_state(state, crtc, old_crtc_state, new_crtc_state, i) { struct amdgpu_crtc *acrtc = to_amdgpu_crtc(crtc); - bool configure_crc = false; dm_new_crtc_state = to_dm_crtc_state(new_crtc_state); @@ -8377,30 +8260,21 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) dc_stream_retain(dm_new_crtc_state->stream); acrtc->dm_irq_params.stream = dm_new_crtc_state->stream; manage_dm_interrupts(adev, acrtc, true); - } + #ifdef CONFIG_DEBUG_FS - if (new_crtc_state->active && - amdgpu_dm_is_valid_crc_source(dm_new_crtc_state->crc_src)) { /** * Frontend may have changed so reapply the CRC capture * settings for the stream. */ dm_new_crtc_state = to_dm_crtc_state(new_crtc_state); - dm_old_crtc_state = to_dm_crtc_state(old_crtc_state); - if (amdgpu_dm_crc_window_is_default(dm_new_crtc_state)) { - if (!old_crtc_state->active || drm_atomic_crtc_needs_modeset(new_crtc_state)) - configure_crc = true; - } else { - if (amdgpu_dm_crc_window_changed(dm_new_crtc_state, dm_old_crtc_state)) - configure_crc = true; - } - - if (configure_crc) + if (amdgpu_dm_is_valid_crc_source(dm_new_crtc_state->crc_src)) { amdgpu_dm_crtc_configure_crc_source( - crtc, dm_new_crtc_state, dm_new_crtc_state->crc_src); - } + crtc, dm_new_crtc_state, + dm_new_crtc_state->crc_src); + } #endif + } } for_each_new_crtc_in_state(state, crtc, new_crtc_state, j) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h index 0b31779a0485..1182dafcef02 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h @@ -336,13 +336,6 @@ struct amdgpu_display_manager { */ const struct gpu_info_soc_bounding_box_v1_0 *soc_bounding_box; -#ifdef CONFIG_DEBUG_FS - /* set the crc calculation window*/ - struct drm_property *crc_win_x_start_property; - struct drm_property *crc_win_y_start_property; - struct drm_property *crc_win_x_end_property; - struct drm_property *crc_win_y_end_property; -#endif /** * @mst_encoders: * @@ -429,15 +422,6 @@ struct dm_plane_state { struct dc_plane_state *dc_state; }; -#ifdef CONFIG_DEBUG_FS -struct crc_rec { - uint16_t x_start; - uint16_t y_start; - uint16_t x_end; - uint16_t y_end; - }; -#endif - struct dm_crtc_state { struct drm_crtc_state base; struct dc_stream_state *stream; @@ -460,9 +444,6 @@ struct dm_crtc_state { struct dc_info_packet vrr_infopacket; int abm_level; -#ifdef CONFIG_DEBUG_FS - struct crc_rec crc_window; -#endif }; #define to_dm_crtc_state(x) container_of(x, struct dm_crtc_state, base) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.c index 7b886a779a8c..c29dc11619f7 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.c @@ -81,41 +81,6 @@ const char *const *amdgpu_dm_crtc_get_crc_sources(struct drm_crtc *crtc, return pipe_crc_sources; } -static void amdgpu_dm_set_crc_window_default(struct dm_crtc_state *dm_crtc_state) -{ - dm_crtc_state->crc_window.x_start = 0; - dm_crtc_state->crc_window.y_start = 0; - dm_crtc_state->crc_window.x_end = 0; - dm_crtc_state->crc_window.y_end = 0; -} - -bool amdgpu_dm_crc_window_is_default(struct dm_crtc_state *dm_crtc_state) -{ - bool ret = true; - - if ((dm_crtc_state->crc_window.x_start != 0) || - (dm_crtc_state->crc_window.y_start != 0) || - (dm_crtc_state->crc_window.x_end != 0) || - (dm_crtc_state->crc_window.y_end != 0)) - ret = false; - - return ret; -} - -bool amdgpu_dm_crc_window_changed(struct dm_crtc_state *dm_new_crtc_state, - struct dm_crtc_state *dm_old_crtc_state) -{ - bool ret = false; - - if ((dm_new_crtc_state->crc_window.x_start != dm_old_crtc_state->crc_window.x_start) || - (dm_new_crtc_state->crc_window.y_start != dm_old_crtc_state->crc_window.y_start) || - (dm_new_crtc_state->crc_window.x_end != dm_old_crtc_state->crc_window.x_end) || - (dm_new_crtc_state->crc_window.y_end != dm_old_crtc_state->crc_window.y_end)) - ret = true; - - return ret; -} - int amdgpu_dm_crtc_verify_crc_source(struct drm_crtc *crtc, const char *src_name, size_t *values_cnt) @@ -140,7 +105,6 @@ int amdgpu_dm_crtc_configure_crc_source(struct drm_crtc *crtc, struct dc_stream_state *stream_state = dm_crtc_state->stream; bool enable = amdgpu_dm_is_valid_crc_source(source); int ret = 0; - struct crc_params *crc_window = NULL, tmp_window; /* Configuration will be deferred to stream enable. */ if (!stream_state) @@ -149,25 +113,9 @@ int amdgpu_dm_crtc_configure_crc_source(struct drm_crtc *crtc, mutex_lock(&adev->dm.dc_lock); /* Enable CRTC CRC generation if necessary. */ - if (dm_is_crc_source_crtc(source) || source == AMDGPU_DM_PIPE_CRC_SOURCE_NONE) { - if (!enable) - amdgpu_dm_set_crc_window_default(dm_crtc_state); - - if (!amdgpu_dm_crc_window_is_default(dm_crtc_state)) { - crc_window = &tmp_window; - - tmp_window.windowa_x_start = dm_crtc_state->crc_window.x_start; - tmp_window.windowa_y_start = dm_crtc_state->crc_window.y_start; - tmp_window.windowa_x_end = dm_crtc_state->crc_window.x_end; - tmp_window.windowa_y_end = dm_crtc_state->crc_window.y_end; - tmp_window.windowb_x_start = dm_crtc_state->crc_window.x_start; - tmp_window.windowb_y_start = dm_crtc_state->crc_window.y_start; - tmp_window.windowb_x_end = dm_crtc_state->crc_window.x_end; - tmp_window.windowb_y_end = dm_crtc_state->crc_window.y_end; - } - + if (dm_is_crc_source_crtc(source)) { if (!dc_stream_configure_crc(stream_state->ctx->dc, - stream_state, crc_window, enable, enable)) { + stream_state, NULL, enable, enable)) { ret = -EINVAL; goto unlock; } diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.h index 0235bfb246e5..f7d731797d3f 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.h +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.h @@ -47,9 +47,6 @@ static inline bool amdgpu_dm_is_valid_crc_source(enum amdgpu_dm_pipe_crc_source /* amdgpu_dm_crc.c */ #ifdef CONFIG_DEBUG_FS -bool amdgpu_dm_crc_window_is_default(struct dm_crtc_state *dm_crtc_state); -bool amdgpu_dm_crc_window_changed(struct dm_crtc_state *dm_new_crtc_state, - struct dm_crtc_state *dm_old_crtc_state); int amdgpu_dm_crtc_configure_crc_source(struct drm_crtc *crtc, struct dm_crtc_state *dm_crtc_state, enum amdgpu_dm_pipe_crc_source source); From 2f0fa789f7b9fb022440f8f846cae175233987aa Mon Sep 17 00:00:00 2001 From: Wayne Lin Date: Tue, 24 Nov 2020 19:57:03 +0800 Subject: [PATCH 139/241] drm/amd/display: Fix to be able to stop crc calculation [Why] Find out when we try to disable CRC calculation, crc generation is still enabled. Main reason is that dc_stream_configure_crc() will never get called when the source is AMDGPU_DM_PIPE_CRC_SOURCE_NONE. [How] Add checking condition that when source is AMDGPU_DM_PIPE_CRC_SOURCE_NONE, we should also call dc_stream_configure_crc() to disable crc calculation. Acked-by: Alex Deucher Reviewed-by: Rodrigo Siqueira Signed-off-by: Wayne Lin Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.c index c29dc11619f7..66cb8730586b 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crc.c @@ -113,7 +113,7 @@ int amdgpu_dm_crtc_configure_crc_source(struct drm_crtc *crtc, mutex_lock(&adev->dm.dc_lock); /* Enable CRTC CRC generation if necessary. */ - if (dm_is_crc_source_crtc(source)) { + if (dm_is_crc_source_crtc(source) || source == AMDGPU_DM_PIPE_CRC_SOURCE_NONE) { if (!dc_stream_configure_crc(stream_state->ctx->dc, stream_state, NULL, enable, enable)) { ret = -EINVAL; From 7a84665619bb5da8c8b6517157875a1fd7632014 Mon Sep 17 00:00:00 2001 From: Israel Rukshin Date: Sun, 10 Jan 2021 14:09:05 +0200 Subject: [PATCH 140/241] nvmet-rdma: Fix NULL deref when setting pi_enable and traddr INADDR_ANY When setting port traddr to INADDR_ANY, the listening cm_id->device is NULL. The associate IB device is known only when a connect request event arrives, so checking T10-PI device capability should be done at this stage. Fixes: b09160c3996c ("nvmet-rdma: add metadata/T10-PI support") Signed-off-by: Israel Rukshin Reviewed-by: Sagi Grimberg Reviewed-by: Max Gurtovoy Signed-off-by: Christoph Hellwig --- drivers/nvme/target/rdma.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index bdfc22eb2a10..06b6b742bb21 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1220,6 +1220,14 @@ nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) } ndev->inline_data_size = nport->inline_data_size; ndev->inline_page_count = inline_page_count; + + if (nport->pi_enable && !(cm_id->device->attrs.device_cap_flags & + IB_DEVICE_INTEGRITY_HANDOVER)) { + pr_warn("T10-PI is not supported by device %s. Disabling it\n", + cm_id->device->name); + nport->pi_enable = false; + } + ndev->device = cm_id->device; kref_init(&ndev->ref); @@ -1855,14 +1863,6 @@ static int nvmet_rdma_enable_port(struct nvmet_rdma_port *port) goto out_destroy_id; } - if (port->nport->pi_enable && - !(cm_id->device->attrs.device_cap_flags & - IB_DEVICE_INTEGRITY_HANDOVER)) { - pr_err("T10-PI is not supported for %pISpcs\n", addr); - ret = -EINVAL; - goto out_destroy_id; - } - port->cm_id = cm_id; return 0; From ada831772188192243f9ea437c46e37e97a5975d Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 13 Jan 2021 14:03:04 -0800 Subject: [PATCH 141/241] nvme-tcp: Fix warning with CONFIG_DEBUG_PREEMPT We shouldn't call smp_processor_id() in a preemptible context, but this is advisory at best, so instead call __smp_processor_id(). Fixes: db5ad6b7f8cd ("nvme-tcp: try to send request in queue_rq context") Reported-by: Or Gerlitz Reported-by: Yi Zhang Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 979ee31b8dd1..b2e0865785ef 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -286,7 +286,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req, * directly, otherwise queue io_work. Also, only do that if we * are on the same cpu, so we don't introduce contention. */ - if (queue->io_cpu == smp_processor_id() && + if (queue->io_cpu == __smp_processor_id() && sync && empty && mutex_trylock(&queue->send_mutex)) { queue->more_requests = !last; nvme_tcp_send_all(queue); From ca1ff67d0fb14f39cf0cc5102b1fbcc3b14f6fb9 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 13 Jan 2021 13:56:57 -0800 Subject: [PATCH 142/241] nvme-tcp: fix possible data corruption with bio merges When a bio merges, we can get a request that spans multiple bios, and the overall request payload size is the sum of all bios. When we calculate how much we need to send from the existing bio (and bvec), we did not take into account the iov_iter byte count cap. Since multipage bvecs support, bvecs can split in the middle which means that when we account for the last bvec send we should also take the iov_iter byte count cap as it might be lower than the last bvec size. Reported-by: Hao Wang Fixes: 3f2304f8c6d6 ("nvme-tcp: add NVMe over TCP host driver") Tested-by: Hao Wang Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index b2e0865785ef..216619926563 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -201,7 +201,7 @@ static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req) static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req) { - return min_t(size_t, req->iter.bvec->bv_len - req->iter.iov_offset, + return min_t(size_t, iov_iter_single_seg_count(&req->iter), req->pdu_len - req->pdu_sent); } From 5ab25a32cd90ce561ac28b9302766e565d61304c Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Wed, 13 Jan 2021 16:00:22 -0800 Subject: [PATCH 143/241] nvme: don't intialize hwmon for discovery controllers Discovery controllers usually don't support smart log page command. So when we connect to the discovery controller we see this warning: nvme nvme0: Failed to read smart log (error 24577) nvme nvme0: new ctrl: NQN "nqn.2014-08.org.nvmexpress.discovery", addr 192.168.123.1:8009 nvme nvme0: Removing ctrl: NQN "nqn.2014-08.org.nvmexpress.discovery" Introduce a new helper to understand if the controller is a discovery controller and use this helper to skip nvme_init_hwmon (also use it in other places that we check if the controller is a discovery controller). Fixes: 400b6a7b13a3 ("nvme: Add hardware monitoring support") Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index f320273fc672..200bdd672c28 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2856,6 +2856,11 @@ static const struct attribute_group *nvme_subsys_attrs_groups[] = { NULL, }; +static inline bool nvme_discovery_ctrl(struct nvme_ctrl *ctrl) +{ + return ctrl->opts && ctrl->opts->discovery_nqn; +} + static bool nvme_validate_cntlid(struct nvme_subsystem *subsys, struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { @@ -2875,7 +2880,7 @@ static bool nvme_validate_cntlid(struct nvme_subsystem *subsys, } if ((id->cmic & NVME_CTRL_CMIC_MULTI_CTRL) || - (ctrl->opts && ctrl->opts->discovery_nqn)) + nvme_discovery_ctrl(ctrl)) continue; dev_err(ctrl->device, @@ -3144,7 +3149,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) goto out_free; } - if (!ctrl->opts->discovery_nqn && !ctrl->kas) { + if (!nvme_discovery_ctrl(ctrl) && !ctrl->kas) { dev_err(ctrl->device, "keep-alive support is mandatory for fabrics\n"); ret = -EINVAL; @@ -3184,7 +3189,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) if (ret < 0) return ret; - if (!ctrl->identified) { + if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) { ret = nvme_hwmon_init(ctrl); if (ret < 0) return ret; From c2083e280a3d4f71941c9c57992d4e621e4e33c5 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Thu, 14 Jan 2021 09:04:49 +0100 Subject: [PATCH 144/241] cfg80211: fix a kerneldoc markup A function has a different name between their prototype and its kernel-doc markup: ../include/net/cfg80211.h:1766: warning: expecting prototype for struct cfg80211_sar_chan_ranges. Prototype was for struct cfg80211_sar_freq_ranges instead Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/c7ed4bc4d9e992ead16d3d2df246f3b56dbfb1fb.1610610937.git.mchehab+huawei@kernel.org Signed-off-by: Johannes Berg --- include/net/cfg80211.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 1b3954afcda4..0d6f7ec86061 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1756,7 +1756,7 @@ struct cfg80211_sar_specs { /** - * struct cfg80211_sar_chan_ranges - sar frequency ranges + * struct cfg80211_sar_freq_ranges - sar frequency ranges * @start_freq: start range edge frequency * @end_freq: end range edge frequency */ From 6020d534fa012b80c6d13811dc4d2dfedca2e403 Mon Sep 17 00:00:00 2001 From: Shayne Chen Date: Tue, 12 Jan 2021 11:20:28 +0800 Subject: [PATCH 145/241] mac80211: fix incorrect strlen of .write in debugfs This fixes strlen mismatch problems happening in some .write callbacks of debugfs. When trying to configure airtime_flags in debugfs, an error appeared: ash: write error: Invalid argument The error is returned from kstrtou16() since a wrong length makes it miss the real end of input string. To fix this, use count as the string length, and set proper end of string for a char buffer. The debug print is shown - airtime_flags_write: count = 2, len = 8, where the actual length is 2, but "len = strlen(buf)" gets 8. Also cleanup the other similar cases for the sake of consistency. Signed-off-by: Sujuan Chen Signed-off-by: Ryder Lee Signed-off-by: Shayne Chen Link: https://lore.kernel.org/r/20210112032028.7482-1-shayne.chen@mediatek.com Signed-off-by: Johannes Berg --- net/mac80211/debugfs.c | 44 +++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 48f144f107d5..9e723d943421 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -120,18 +120,17 @@ static ssize_t aqm_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[100]; - size_t len; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = '\0'; - len = strlen(buf); - if (len > 0 && buf[len-1] == '\n') - buf[len-1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (sscanf(buf, "fq_limit %u", &local->fq.limit) == 1) return count; @@ -177,18 +176,17 @@ static ssize_t airtime_flags_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[16]; - size_t len; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = 0; - len = strlen(buf); - if (len > 0 && buf[len - 1] == '\n') - buf[len - 1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (kstrtou16(buf, 0, &local->airtime_flags)) return -EINVAL; @@ -237,20 +235,19 @@ static ssize_t aql_txq_limit_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[100]; - size_t len; u32 ac, q_limit_low, q_limit_high, q_limit_low_old, q_limit_high_old; struct sta_info *sta; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = 0; - len = strlen(buf); - if (len > 0 && buf[len - 1] == '\n') - buf[len - 1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (sscanf(buf, "%u %u %u", &ac, &q_limit_low, &q_limit_high) != 3) return -EINVAL; @@ -306,18 +303,17 @@ static ssize_t force_tx_status_write(struct file *file, { struct ieee80211_local *local = file->private_data; char buf[3]; - size_t len; - if (count > sizeof(buf)) + if (count >= sizeof(buf)) return -EINVAL; if (copy_from_user(buf, user_buf, count)) return -EFAULT; - buf[sizeof(buf) - 1] = '\0'; - len = strlen(buf); - if (len > 0 && buf[len - 1] == '\n') - buf[len - 1] = 0; + if (count && buf[count - 1] == '\n') + buf[count - 1] = '\0'; + else + buf[count] = '\0'; if (buf[0] == '0' && buf[1] == '\0') local->force_tx_status = 0; From 622d3b4e39381262da7b18ca1ed1311df227de86 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 18 Dec 2020 19:47:17 +0100 Subject: [PATCH 146/241] mac80211: fix fast-rx encryption check When using WEP, the default unicast key needs to be selected, instead of the STA PTK. Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20201218184718.93650-5-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/rx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 13b9bcc4865d..972895e9f22d 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -4176,6 +4176,8 @@ void ieee80211_check_fast_rx(struct sta_info *sta) rcu_read_lock(); key = rcu_dereference(sta->ptk[sta->ptk_idx]); + if (!key) + key = rcu_dereference(sdata->default_unicast_key); if (key) { switch (key->conf.cipher) { case WLAN_CIPHER_SUITE_TKIP: From b101dd2d22f45d203010b40c739df346a0cbebef Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 18 Dec 2020 19:47:16 +0100 Subject: [PATCH 147/241] mac80211: fix encryption key selection for 802.3 xmit When using WEP, the default unicast key needs to be selected, instead of the STA PTK. Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20201218184718.93650-4-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 6422da6690f7..2d606dbab37c 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -4251,7 +4251,6 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, struct ethhdr *ehdr = (struct ethhdr *)skb->data; struct ieee80211_key *key; struct sta_info *sta; - bool offload = true; if (unlikely(skb->len < ETH_HLEN)) { kfree_skb(skb); @@ -4267,18 +4266,22 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, if (unlikely(IS_ERR_OR_NULL(sta) || !sta->uploaded || !test_sta_flag(sta, WLAN_STA_AUTHORIZED) || - sdata->control_port_protocol == ehdr->h_proto)) - offload = false; - else if ((key = rcu_dereference(sta->ptk[sta->ptk_idx])) && - (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) || - key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)) - offload = false; + sdata->control_port_protocol == ehdr->h_proto)) + goto skip_offload; - if (offload) - ieee80211_8023_xmit(sdata, dev, sta, key, skb); - else - ieee80211_subif_start_xmit(skb, dev); + key = rcu_dereference(sta->ptk[sta->ptk_idx]); + if (!key) + key = rcu_dereference(sdata->default_unicast_key); + if (key && (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) || + key->conf.cipher == WLAN_CIPHER_SUITE_TKIP)) + goto skip_offload; + + ieee80211_8023_xmit(sdata, dev, sta, key, skb); + goto out; + +skip_offload: + ieee80211_subif_start_xmit(skb, dev); out: rcu_read_unlock(); From 2463ec86cd0338a2c2edbfb0b9d50c52ff76ff43 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 18 Dec 2020 20:15:25 +0100 Subject: [PATCH 148/241] mac80211: do not drop tx nulldata packets on encrypted links ieee80211_tx_h_select_key drops any non-mgmt packets without a key when encryption is used. This is wrong for nulldata packets that can't be encrypted and are sent out for probing clients and indicating 4-address mode. Reported-by: Sebastian Gottschall Fixes: a0761a301746 ("mac80211: drop data frames without key on encrypted links") Signed-off-by: Felix Fietkau Link: https://lore.kernel.org/r/20201218191525.1168-1-nbd@nbd.name Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 2d606dbab37c..de8325c79dd5 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -649,7 +649,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) if (!skip_hw && tx->key && tx->key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE) info->control.hw_key = &tx->key->conf; - } else if (!ieee80211_is_mgmt(hdr->frame_control) && tx->sta && + } else if (ieee80211_is_data_present(hdr->frame_control) && tx->sta && test_sta_flag(tx->sta, WLAN_STA_USES_ENCRYPTION)) { return TX_DROP; } From c13cf5c159660451c8fbdc37efb998b198e1d305 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sat, 26 Dec 2020 10:39:08 +0100 Subject: [PATCH 149/241] mac80211: check if atf has been disabled in __ieee80211_schedule_txq MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Check if atf has been disabled in __ieee80211_schedule_txq() in order to avoid a given sta is always put to the beginning of the active_txqs list and never moved to the end since deficit is not decremented in ieee80211_sta_register_airtime() Fixes: b4809e9484da1 ("mac80211: Add airtime accounting and scheduling to TXQs") Signed-off-by: Lorenzo Bianconi Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/r/93889406c50f1416214c079ca0b8c9faecc5143e.1608975195.git.lorenzo@kernel.org Signed-off-by: Johannes Berg --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index de8325c79dd5..ebb3228ce971 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -3809,7 +3809,7 @@ void __ieee80211_schedule_txq(struct ieee80211_hw *hw, * get immediately moved to the back of the list on the next * call to ieee80211_next_txq(). */ - if (txqi->txq.sta && + if (txqi->txq.sta && local->airtime_flags && wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AIRTIME_FAIRNESS)) list_add(&txqi->schedule_order, From 402a89660e9dc880710b12773076a336c9dab3d7 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 150/241] drm/nouveau/bios: fix issue shadowing expansion ROMs This issue has generally been covered up by the presence of additional expansion ROMs after the ones we're interested in, with header fetches of subsequent images loading enough of the ROM to hide the issue. Noticed on GA102, which lacks a type 0x70 image compared to TU102,. [ 906.364197] nouveau 0000:09:00.0: bios: 00000000: type 00, 65024 bytes [ 906.381205] nouveau 0000:09:00.0: bios: 0000fe00: type 03, 91648 bytes [ 906.405213] nouveau 0000:09:00.0: bios: 00026400: type e0, 22016 bytes [ 906.410984] nouveau 0000:09:00.0: bios: 0002ba00: type e0, 366080 bytes vs [ 22.961901] nouveau 0000:09:00.0: bios: 00000000: type 00, 60416 bytes [ 22.984174] nouveau 0000:09:00.0: bios: 0000ec00: type 03, 71168 bytes [ 23.010446] nouveau 0000:09:00.0: bios: 00020200: type e0, 48128 bytes [ 23.028220] nouveau 0000:09:00.0: bios: 0002be00: type e0, 140800 bytes [ 23.080196] nouveau 0000:09:00.0: bios: 0004e400: type 70, 7168 bytes Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadow.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadow.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadow.c index 7deb81b6dbac..4b571cc6bc70 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadow.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadow.c @@ -75,7 +75,7 @@ shadow_image(struct nvkm_bios *bios, int idx, u32 offset, struct shadow *mthd) nvkm_debug(subdev, "%08x: type %02x, %d bytes\n", image.base, image.type, image.size); - if (!shadow_fetch(bios, mthd, image.size)) { + if (!shadow_fetch(bios, mthd, image.base + image.size)) { nvkm_debug(subdev, "%08x: fetch failed\n", image.base); return 0; } From e05e06cd34f5311f677294a08b609acfbc315236 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 151/241] drm/nouveau/privring: ack interrupts the same way as RM Whatever it is that we were doing before doesn't work on Ampere. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/subdev/ibus/gf100.c | 10 +++++++--- drivers/gpu/drm/nouveau/nvkm/subdev/ibus/gk104.c | 10 +++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/ibus/gf100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/ibus/gf100.c index 2340040942c9..1115376bc85f 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/ibus/gf100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/ibus/gf100.c @@ -22,6 +22,7 @@ * Authors: Ben Skeggs */ #include "priv.h" +#include static void gf100_ibus_intr_hub(struct nvkm_subdev *ibus, int i) @@ -31,7 +32,6 @@ gf100_ibus_intr_hub(struct nvkm_subdev *ibus, int i) u32 data = nvkm_rd32(device, 0x122124 + (i * 0x0400)); u32 stat = nvkm_rd32(device, 0x122128 + (i * 0x0400)); nvkm_debug(ibus, "HUB%d: %06x %08x (%08x)\n", i, addr, data, stat); - nvkm_mask(device, 0x122128 + (i * 0x0400), 0x00000200, 0x00000000); } static void @@ -42,7 +42,6 @@ gf100_ibus_intr_rop(struct nvkm_subdev *ibus, int i) u32 data = nvkm_rd32(device, 0x124124 + (i * 0x0400)); u32 stat = nvkm_rd32(device, 0x124128 + (i * 0x0400)); nvkm_debug(ibus, "ROP%d: %06x %08x (%08x)\n", i, addr, data, stat); - nvkm_mask(device, 0x124128 + (i * 0x0400), 0x00000200, 0x00000000); } static void @@ -53,7 +52,6 @@ gf100_ibus_intr_gpc(struct nvkm_subdev *ibus, int i) u32 data = nvkm_rd32(device, 0x128124 + (i * 0x0400)); u32 stat = nvkm_rd32(device, 0x128128 + (i * 0x0400)); nvkm_debug(ibus, "GPC%d: %06x %08x (%08x)\n", i, addr, data, stat); - nvkm_mask(device, 0x128128 + (i * 0x0400), 0x00000200, 0x00000000); } void @@ -90,6 +88,12 @@ gf100_ibus_intr(struct nvkm_subdev *ibus) intr1 &= ~stat; } } + + nvkm_mask(device, 0x121c4c, 0x0000003f, 0x00000002); + nvkm_msec(device, 2000, + if (!(nvkm_rd32(device, 0x121c4c) & 0x0000003f)) + break; + ); } static int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/ibus/gk104.c b/drivers/gpu/drm/nouveau/nvkm/subdev/ibus/gk104.c index f3915f85838e..22e487b493ad 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/ibus/gk104.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/ibus/gk104.c @@ -22,6 +22,7 @@ * Authors: Ben Skeggs */ #include "priv.h" +#include static void gk104_ibus_intr_hub(struct nvkm_subdev *ibus, int i) @@ -31,7 +32,6 @@ gk104_ibus_intr_hub(struct nvkm_subdev *ibus, int i) u32 data = nvkm_rd32(device, 0x122124 + (i * 0x0800)); u32 stat = nvkm_rd32(device, 0x122128 + (i * 0x0800)); nvkm_debug(ibus, "HUB%d: %06x %08x (%08x)\n", i, addr, data, stat); - nvkm_mask(device, 0x122128 + (i * 0x0800), 0x00000200, 0x00000000); } static void @@ -42,7 +42,6 @@ gk104_ibus_intr_rop(struct nvkm_subdev *ibus, int i) u32 data = nvkm_rd32(device, 0x124124 + (i * 0x0800)); u32 stat = nvkm_rd32(device, 0x124128 + (i * 0x0800)); nvkm_debug(ibus, "ROP%d: %06x %08x (%08x)\n", i, addr, data, stat); - nvkm_mask(device, 0x124128 + (i * 0x0800), 0x00000200, 0x00000000); } static void @@ -53,7 +52,6 @@ gk104_ibus_intr_gpc(struct nvkm_subdev *ibus, int i) u32 data = nvkm_rd32(device, 0x128124 + (i * 0x0800)); u32 stat = nvkm_rd32(device, 0x128128 + (i * 0x0800)); nvkm_debug(ibus, "GPC%d: %06x %08x (%08x)\n", i, addr, data, stat); - nvkm_mask(device, 0x128128 + (i * 0x0800), 0x00000200, 0x00000000); } void @@ -90,6 +88,12 @@ gk104_ibus_intr(struct nvkm_subdev *ibus) intr1 &= ~stat; } } + + nvkm_mask(device, 0x12004c, 0x0000003f, 0x00000002); + nvkm_msec(device, 2000, + if (!(nvkm_rd32(device, 0x12004c) & 0x0000003f)) + break; + ); } static int From b5510d1e21d80e2fa2286468ca8c2922f5895ef8 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 152/241] drm/nouveau/i2c/gk110: split out from i2c/gk104 No functional changes here yet. Signed-off-by: Ben Skeggs --- .../gpu/drm/nouveau/include/nvkm/subdev/i2c.h | 1 + .../gpu/drm/nouveau/nvkm/engine/device/base.c | 12 +++--- .../gpu/drm/nouveau/nvkm/subdev/i2c/Kbuild | 1 + .../gpu/drm/nouveau/nvkm/subdev/i2c/gk110.c | 38 +++++++++++++++++++ 4 files changed, 46 insertions(+), 6 deletions(-) create mode 100644 drivers/gpu/drm/nouveau/nvkm/subdev/i2c/gk110.c diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/i2c.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/i2c.h index 81b977319640..640f649ce497 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/i2c.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/i2c.h @@ -92,6 +92,7 @@ int g94_i2c_new(struct nvkm_device *, int, struct nvkm_i2c **); int gf117_i2c_new(struct nvkm_device *, int, struct nvkm_i2c **); int gf119_i2c_new(struct nvkm_device *, int, struct nvkm_i2c **); int gk104_i2c_new(struct nvkm_device *, int, struct nvkm_i2c **); +int gk110_i2c_new(struct nvkm_device *, int, struct nvkm_i2c **); int gm200_i2c_new(struct nvkm_device *, int, struct nvkm_i2c **); static inline int diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index 7851bec5f0e5..ffada13c416c 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -1815,7 +1815,7 @@ nvf0_chipset = { .fb = gk110_fb_new, .fuse = gf100_fuse_new, .gpio = gk104_gpio_new, - .i2c = gk104_i2c_new, + .i2c = gk110_i2c_new, .ibus = gk104_ibus_new, .iccsense = gf100_iccsense_new, .imem = nv50_instmem_new, @@ -1853,7 +1853,7 @@ nvf1_chipset = { .fb = gk110_fb_new, .fuse = gf100_fuse_new, .gpio = gk104_gpio_new, - .i2c = gk104_i2c_new, + .i2c = gk110_i2c_new, .ibus = gk104_ibus_new, .iccsense = gf100_iccsense_new, .imem = nv50_instmem_new, @@ -1891,7 +1891,7 @@ nv106_chipset = { .fb = gk110_fb_new, .fuse = gf100_fuse_new, .gpio = gk104_gpio_new, - .i2c = gk104_i2c_new, + .i2c = gk110_i2c_new, .ibus = gk104_ibus_new, .iccsense = gf100_iccsense_new, .imem = nv50_instmem_new, @@ -1929,7 +1929,7 @@ nv108_chipset = { .fb = gk110_fb_new, .fuse = gf100_fuse_new, .gpio = gk104_gpio_new, - .i2c = gk104_i2c_new, + .i2c = gk110_i2c_new, .ibus = gk104_ibus_new, .iccsense = gf100_iccsense_new, .imem = nv50_instmem_new, @@ -1967,7 +1967,7 @@ nv117_chipset = { .fb = gm107_fb_new, .fuse = gm107_fuse_new, .gpio = gk104_gpio_new, - .i2c = gk104_i2c_new, + .i2c = gk110_i2c_new, .ibus = gk104_ibus_new, .iccsense = gf100_iccsense_new, .imem = nv50_instmem_new, @@ -2003,7 +2003,7 @@ nv118_chipset = { .fb = gm107_fb_new, .fuse = gm107_fuse_new, .gpio = gk104_gpio_new, - .i2c = gk104_i2c_new, + .i2c = gk110_i2c_new, .ibus = gk104_ibus_new, .iccsense = gf100_iccsense_new, .imem = nv50_instmem_new, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/Kbuild b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/Kbuild index 723d0284caef..819703913a00 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/Kbuild +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/Kbuild @@ -7,6 +7,7 @@ nvkm-y += nvkm/subdev/i2c/g94.o nvkm-y += nvkm/subdev/i2c/gf117.o nvkm-y += nvkm/subdev/i2c/gf119.o nvkm-y += nvkm/subdev/i2c/gk104.o +nvkm-y += nvkm/subdev/i2c/gk110.o nvkm-y += nvkm/subdev/i2c/gm200.o nvkm-y += nvkm/subdev/i2c/pad.o diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/gk110.c b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/gk110.c new file mode 100644 index 000000000000..10eaf1d70f62 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/gk110.c @@ -0,0 +1,38 @@ +/* + * Copyright 2021 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "priv.h" +#include "pad.h" + +static const struct nvkm_i2c_func +gk110_i2c = { + .pad_x_new = gf119_i2c_pad_x_new, + .pad_s_new = gf119_i2c_pad_s_new, + .aux = 4, + .aux_stat = gk104_aux_stat, + .aux_mask = gk104_aux_mask, +}; + +int +gk110_i2c_new(struct nvkm_device *device, int index, struct nvkm_i2c **pi2c) +{ + return nvkm_i2c_new_(&gk110_i2c, device, index, pi2c); +} From 8ad95edc39100c22c29ab1d2588332b99f387c8e Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 153/241] drm/nouveau/i2c/gk110-: disable hw-initiated dpcd reads RM does this around transactions, and it seemed to help while debugging AUXCH issues on GA102. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/subdev/i2c/aux.h | 7 +++++++ drivers/gpu/drm/nouveau/nvkm/subdev/i2c/auxg94.c | 10 +++++++--- drivers/gpu/drm/nouveau/nvkm/subdev/i2c/auxgm200.c | 9 +++++++-- drivers/gpu/drm/nouveau/nvkm/subdev/i2c/gk110.c | 7 +++++++ drivers/gpu/drm/nouveau/nvkm/subdev/i2c/gm200.c | 7 +++++++ drivers/gpu/drm/nouveau/nvkm/subdev/i2c/pad.h | 2 +- drivers/gpu/drm/nouveau/nvkm/subdev/i2c/priv.h | 4 ++++ 7 files changed, 40 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/aux.h b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/aux.h index 30b48896965e..f920eabf8628 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/aux.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/aux.h @@ -3,6 +3,13 @@ #define __NVKM_I2C_AUX_H__ #include "pad.h" +static inline void +nvkm_i2c_aux_autodpcd(struct nvkm_i2c *i2c, int aux, bool enable) +{ + if (i2c->func->aux_autodpcd) + i2c->func->aux_autodpcd(i2c, aux, false); +} + struct nvkm_i2c_aux_func { bool address_only; int (*xfer)(struct nvkm_i2c_aux *, bool retry, u8 type, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/auxg94.c b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/auxg94.c index db7769cb33eb..47068f6f9c55 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/auxg94.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/auxg94.c @@ -77,7 +77,8 @@ g94_i2c_aux_xfer(struct nvkm_i2c_aux *obj, bool retry, u8 type, u32 addr, u8 *data, u8 *size) { struct g94_i2c_aux *aux = g94_i2c_aux(obj); - struct nvkm_device *device = aux->base.pad->i2c->subdev.device; + struct nvkm_i2c *i2c = aux->base.pad->i2c; + struct nvkm_device *device = i2c->subdev.device; const u32 base = aux->ch * 0x50; u32 ctrl, stat, timeout, retries = 0; u32 xbuf[4] = {}; @@ -96,6 +97,8 @@ g94_i2c_aux_xfer(struct nvkm_i2c_aux *obj, bool retry, goto out; } + nvkm_i2c_aux_autodpcd(i2c, aux->ch, false); + if (!(type & 1)) { memcpy(xbuf, data, *size); for (i = 0; i < 16; i += 4) { @@ -128,7 +131,7 @@ g94_i2c_aux_xfer(struct nvkm_i2c_aux *obj, bool retry, if (!timeout--) { AUX_ERR(&aux->base, "timeout %08x", ctrl); ret = -EIO; - goto out; + goto out_err; } } while (ctrl & 0x00010000); ret = 0; @@ -154,7 +157,8 @@ g94_i2c_aux_xfer(struct nvkm_i2c_aux *obj, bool retry, memcpy(data, xbuf, *size); *size = stat & 0x0000001f; } - +out_err: + nvkm_i2c_aux_autodpcd(i2c, aux->ch, true); out: g94_i2c_aux_fini(aux); return ret < 0 ? ret : (stat & 0x000f0000) >> 16; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/auxgm200.c b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/auxgm200.c index edb6148cbca0..ab67b67fd2a5 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/auxgm200.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/auxgm200.c @@ -77,7 +77,8 @@ gm200_i2c_aux_xfer(struct nvkm_i2c_aux *obj, bool retry, u8 type, u32 addr, u8 *data, u8 *size) { struct gm200_i2c_aux *aux = gm200_i2c_aux(obj); - struct nvkm_device *device = aux->base.pad->i2c->subdev.device; + struct nvkm_i2c *i2c = aux->base.pad->i2c; + struct nvkm_device *device = i2c->subdev.device; const u32 base = aux->ch * 0x50; u32 ctrl, stat, timeout, retries = 0; u32 xbuf[4] = {}; @@ -96,6 +97,8 @@ gm200_i2c_aux_xfer(struct nvkm_i2c_aux *obj, bool retry, goto out; } + nvkm_i2c_aux_autodpcd(i2c, aux->ch, false); + if (!(type & 1)) { memcpy(xbuf, data, *size); for (i = 0; i < 16; i += 4) { @@ -128,7 +131,7 @@ gm200_i2c_aux_xfer(struct nvkm_i2c_aux *obj, bool retry, if (!timeout--) { AUX_ERR(&aux->base, "timeout %08x", ctrl); ret = -EIO; - goto out; + goto out_err; } } while (ctrl & 0x00010000); ret = 0; @@ -155,6 +158,8 @@ gm200_i2c_aux_xfer(struct nvkm_i2c_aux *obj, bool retry, *size = stat & 0x0000001f; } +out_err: + nvkm_i2c_aux_autodpcd(i2c, aux->ch, true); out: gm200_i2c_aux_fini(aux); return ret < 0 ? ret : (stat & 0x000f0000) >> 16; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/gk110.c b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/gk110.c index 10eaf1d70f62..8e3bfa1af52a 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/gk110.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/gk110.c @@ -22,6 +22,12 @@ #include "priv.h" #include "pad.h" +static void +gk110_aux_autodpcd(struct nvkm_i2c *i2c, int aux, bool enable) +{ + nvkm_mask(i2c->subdev.device, 0x00e4f8 + (aux * 0x50), 0x00010000, enable << 16); +} + static const struct nvkm_i2c_func gk110_i2c = { .pad_x_new = gf119_i2c_pad_x_new, @@ -29,6 +35,7 @@ gk110_i2c = { .aux = 4, .aux_stat = gk104_aux_stat, .aux_mask = gk104_aux_mask, + .aux_autodpcd = gk110_aux_autodpcd, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/gm200.c b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/gm200.c index a23c5f315221..7b2375bff8a9 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/gm200.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/gm200.c @@ -24,6 +24,12 @@ #include "priv.h" #include "pad.h" +static void +gm200_aux_autodpcd(struct nvkm_i2c *i2c, int aux, bool enable) +{ + nvkm_mask(i2c->subdev.device, 0x00d968 + (aux * 0x50), 0x00010000, enable << 16); +} + static const struct nvkm_i2c_func gm200_i2c = { .pad_x_new = gf119_i2c_pad_x_new, @@ -31,6 +37,7 @@ gm200_i2c = { .aux = 8, .aux_stat = gk104_aux_stat, .aux_mask = gk104_aux_mask, + .aux_autodpcd = gm200_aux_autodpcd, }; int diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/pad.h b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/pad.h index 461016814f4f..44b7bb7d4777 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/pad.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/pad.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: MIT */ #ifndef __NVKM_I2C_PAD_H__ #define __NVKM_I2C_PAD_H__ -#include +#include "priv.h" struct nvkm_i2c_pad { const struct nvkm_i2c_pad_func *func; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/priv.h index bd86bc298ebe..e35f6036fcfc 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/priv.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/priv.h @@ -23,6 +23,10 @@ struct nvkm_i2c_func { /* mask on/off interrupt types for a given set of auxch */ void (*aux_mask)(struct nvkm_i2c *, u32, u32, u32); + + /* enable/disable HW-initiated DPCD reads + */ + void (*aux_autodpcd)(struct nvkm_i2c *, int aux, bool enable); }; void g94_aux_stat(struct nvkm_i2c *, u32 *, u32 *, u32 *, u32 *); From ba6e9ab0fcf3d76e3952deb12b5f993991621d9c Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 154/241] drm/nouveau/i2c/gm200: increase width of aux semaphore owner fields Noticed while debugging GA102. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/subdev/i2c/auxgm200.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/auxgm200.c b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/auxgm200.c index ab67b67fd2a5..8bd1d442e465 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/auxgm200.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/i2c/auxgm200.c @@ -33,7 +33,7 @@ static void gm200_i2c_aux_fini(struct gm200_i2c_aux *aux) { struct nvkm_device *device = aux->base.pad->i2c->subdev.device; - nvkm_mask(device, 0x00d954 + (aux->ch * 0x50), 0x00310000, 0x00000000); + nvkm_mask(device, 0x00d954 + (aux->ch * 0x50), 0x00710000, 0x00000000); } static int @@ -54,10 +54,10 @@ gm200_i2c_aux_init(struct gm200_i2c_aux *aux) AUX_ERR(&aux->base, "begin idle timeout %08x", ctrl); return -EBUSY; } - } while (ctrl & 0x03010000); + } while (ctrl & 0x07010000); /* set some magic, and wait up to 1ms for it to appear */ - nvkm_mask(device, 0x00d954 + (aux->ch * 0x50), 0x00300000, ureq); + nvkm_mask(device, 0x00d954 + (aux->ch * 0x50), 0x00700000, ureq); timeout = 1000; do { ctrl = nvkm_rd32(device, 0x00d954 + (aux->ch * 0x50)); @@ -67,7 +67,7 @@ gm200_i2c_aux_init(struct gm200_i2c_aux *aux) gm200_i2c_aux_fini(aux); return -EBUSY; } - } while ((ctrl & 0x03000000) != urep); + } while ((ctrl & 0x07000000) != urep); return 0; } From add42781ad76c5ae65127bf13852a4c6b2f08849 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 155/241] drm/nouveau/mmu: fix vram heap sizing Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/subdev/mmu/base.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/base.c index de91e9a26172..6d5212ae2fd5 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mmu/base.c @@ -316,9 +316,9 @@ nvkm_mmu_vram(struct nvkm_mmu *mmu) { struct nvkm_device *device = mmu->subdev.device; struct nvkm_mm *mm = &device->fb->ram->vram; - const u32 sizeN = nvkm_mm_heap_size(mm, NVKM_RAM_MM_NORMAL); - const u32 sizeU = nvkm_mm_heap_size(mm, NVKM_RAM_MM_NOMAP); - const u32 sizeM = nvkm_mm_heap_size(mm, NVKM_RAM_MM_MIXED); + const u64 sizeN = nvkm_mm_heap_size(mm, NVKM_RAM_MM_NORMAL); + const u64 sizeU = nvkm_mm_heap_size(mm, NVKM_RAM_MM_NOMAP); + const u64 sizeM = nvkm_mm_heap_size(mm, NVKM_RAM_MM_MIXED); u8 type = NVKM_MEM_KIND * !!mmu->func->kind; u8 heap = NVKM_MEM_VRAM; int heapM, heapN, heapU; From 3b050680c84153d8e6f5ae3785922cd417f4b071 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 156/241] drm/nouveau/core: recognise GA10[024] GA100 hidden behind a module option, as it's not been as well verified since initial bring-up and may need additional changes. There's no display anyway, so this can wait for a bit. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/include/nvif/cl0080.h | 1 + .../drm/nouveau/include/nvkm/core/device.h | 1 + drivers/gpu/drm/nouveau/nouveau_backlight.c | 1 + .../gpu/drm/nouveau/nvkm/engine/device/base.c | 35 +++++++++++++++++-- .../gpu/drm/nouveau/nvkm/engine/device/user.c | 1 + 5 files changed, 36 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/nouveau/include/nvif/cl0080.h b/drivers/gpu/drm/nouveau/include/nvif/cl0080.h index cd9a2e687bb6..57d4f457a7d4 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/cl0080.h +++ b/drivers/gpu/drm/nouveau/include/nvif/cl0080.h @@ -33,6 +33,7 @@ struct nv_device_info_v0 { #define NV_DEVICE_INFO_V0_PASCAL 0x0a #define NV_DEVICE_INFO_V0_VOLTA 0x0b #define NV_DEVICE_INFO_V0_TURING 0x0c +#define NV_DEVICE_INFO_V0_AMPERE 0x0d __u8 family; __u8 pad06[2]; __u64 ram_size; diff --git a/drivers/gpu/drm/nouveau/include/nvkm/core/device.h b/drivers/gpu/drm/nouveau/include/nvkm/core/device.h index 5c007ce62fc3..c920939a1467 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/core/device.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/core/device.h @@ -120,6 +120,7 @@ struct nvkm_device { GP100 = 0x130, GV100 = 0x140, TU100 = 0x160, + GA100 = 0x170, } card_type; u32 chipset; u8 chiprev; diff --git a/drivers/gpu/drm/nouveau/nouveau_backlight.c b/drivers/gpu/drm/nouveau/nouveau_backlight.c index c7a94c94dbf3..72f35a2babcb 100644 --- a/drivers/gpu/drm/nouveau/nouveau_backlight.c +++ b/drivers/gpu/drm/nouveau/nouveau_backlight.c @@ -256,6 +256,7 @@ nouveau_backlight_init(struct drm_connector *connector) case NV_DEVICE_INFO_V0_PASCAL: case NV_DEVICE_INFO_V0_VOLTA: case NV_DEVICE_INFO_V0_TURING: + case NV_DEVICE_INFO_V0_AMPERE: //XXX: not confirmed ret = nv50_backlight_init(nv_encoder, &props, &ops); break; default: diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index ffada13c416c..b6b094bbd562 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2652,6 +2652,21 @@ nv168_chipset = { .sec2 = tu102_sec2_new, }; +static const struct nvkm_device_chip +nv170_chipset = { + .name = "GA100", +}; + +static const struct nvkm_device_chip +nv172_chipset = { + .name = "GA102", +}; + +static const struct nvkm_device_chip +nv174_chipset = { + .name = "GA104", +}; + static int nvkm_device_event_ctor(struct nvkm_object *object, void *data, u32 size, struct nvkm_notify *notify) @@ -3063,6 +3078,7 @@ nvkm_device_ctor(const struct nvkm_device_func *func, case 0x130: device->card_type = GP100; break; case 0x140: device->card_type = GV100; break; case 0x160: device->card_type = TU100; break; + case 0x170: device->card_type = GA100; break; default: break; } @@ -3160,10 +3176,23 @@ nvkm_device_ctor(const struct nvkm_device_func *func, case 0x166: device->chip = &nv166_chipset; break; case 0x167: device->chip = &nv167_chipset; break; case 0x168: device->chip = &nv168_chipset; break; + case 0x172: device->chip = &nv172_chipset; break; + case 0x174: device->chip = &nv174_chipset; break; default: - nvdev_error(device, "unknown chipset (%08x)\n", boot0); - ret = -ENODEV; - goto done; + if (nvkm_boolopt(device->cfgopt, "NvEnableUnsupportedChipsets", false)) { + switch (device->chipset) { + case 0x170: device->chip = &nv170_chipset; break; + default: + break; + } + } + + if (!device->chip) { + nvdev_error(device, "unknown chipset (%08x)\n", boot0); + ret = -ENODEV; + goto done; + } + break; } nvdev_info(device, "NVIDIA %s (%08x)\n", diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/user.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/user.c index 03c6d9aef075..147894798786 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/user.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/user.c @@ -176,6 +176,7 @@ nvkm_udevice_info(struct nvkm_udevice *udev, void *data, u32 size) case GP100: args->v0.family = NV_DEVICE_INFO_V0_PASCAL; break; case GV100: args->v0.family = NV_DEVICE_INFO_V0_VOLTA; break; case TU100: args->v0.family = NV_DEVICE_INFO_V0_TURING; break; + case GA100: args->v0.family = NV_DEVICE_INFO_V0_AMPERE; break; default: args->v0.family = 0; break; From caeb6ab899c3d36a74cda6e299c6e1c9c4e2a22e Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 157/241] drm/nouveau/kms/nv50-: fix case where notifier buffer is at offset 0 VRAM offset 0 is a valid address, triggered on GA102. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/dispnv50/disp.c | 4 ++-- drivers/gpu/drm/nouveau/dispnv50/disp.h | 2 +- drivers/gpu/drm/nouveau/dispnv50/wimmc37b.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/nouveau/dispnv50/disp.c b/drivers/gpu/drm/nouveau/dispnv50/disp.c index 33fff388dd83..c6367035970e 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/disp.c +++ b/drivers/gpu/drm/nouveau/dispnv50/disp.c @@ -222,7 +222,7 @@ nv50_dmac_wait(struct nvif_push *push, u32 size) int nv50_dmac_create(struct nvif_device *device, struct nvif_object *disp, - const s32 *oclass, u8 head, void *data, u32 size, u64 syncbuf, + const s32 *oclass, u8 head, void *data, u32 size, s64 syncbuf, struct nv50_dmac *dmac) { struct nouveau_cli *cli = (void *)device->object.client; @@ -271,7 +271,7 @@ nv50_dmac_create(struct nvif_device *device, struct nvif_object *disp, if (ret) return ret; - if (!syncbuf) + if (syncbuf < 0) return 0; ret = nvif_object_ctor(&dmac->base.user, "kmsSyncCtxDma", NV50_DISP_HANDLE_SYNCBUF, diff --git a/drivers/gpu/drm/nouveau/dispnv50/disp.h b/drivers/gpu/drm/nouveau/dispnv50/disp.h index 92bddc083617..38dec11e7dda 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/disp.h +++ b/drivers/gpu/drm/nouveau/dispnv50/disp.h @@ -95,7 +95,7 @@ struct nv50_outp_atom { int nv50_dmac_create(struct nvif_device *device, struct nvif_object *disp, const s32 *oclass, u8 head, void *data, u32 size, - u64 syncbuf, struct nv50_dmac *dmac); + s64 syncbuf, struct nv50_dmac *dmac); void nv50_dmac_destroy(struct nv50_dmac *); /* diff --git a/drivers/gpu/drm/nouveau/dispnv50/wimmc37b.c b/drivers/gpu/drm/nouveau/dispnv50/wimmc37b.c index 685b70871324..b390029c69ec 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/wimmc37b.c +++ b/drivers/gpu/drm/nouveau/dispnv50/wimmc37b.c @@ -76,7 +76,7 @@ wimmc37b_init_(const struct nv50_wimm_func *func, struct nouveau_drm *drm, int ret; ret = nv50_dmac_create(&drm->client.device, &disp->disp->object, - &oclass, 0, &args, sizeof(args), 0, + &oclass, 0, &args, sizeof(args), -1, &wndw->wimm); if (ret) { NV_ERROR(drm, "wimm%04x allocation failed: %d\n", oclass, ret); From 70afbe4bdc0a7ccdb462a38216f5abc3db7e5c1b Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 158/241] drm/nouveau/pci/ga10[024]: initial support Appears to be compatible with GP100 code. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/engine/device/base.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index b6b094bbd562..95c470d13cb9 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2655,16 +2655,19 @@ nv168_chipset = { static const struct nvkm_device_chip nv170_chipset = { .name = "GA100", + .pci = gp100_pci_new, }; static const struct nvkm_device_chip nv172_chipset = { .name = "GA102", + .pci = gp100_pci_new, }; static const struct nvkm_device_chip nv174_chipset = { .name = "GA104", + .pci = gp100_pci_new, }; static int From a34632482f1ea768429a9d4c79a10d12f5093405 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 159/241] drm/nouveau/bios/ga10[024]: initial support Forcing PRAMIN-shadowing off for GA100, as it requires display, and we don't know if/where the fuse register for detecting its presence is. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/engine/device/base.c | 3 +++ drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowramin.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index 95c470d13cb9..7c35c32cdf73 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2655,18 +2655,21 @@ nv168_chipset = { static const struct nvkm_device_chip nv170_chipset = { .name = "GA100", + .bios = nvkm_bios_new, .pci = gp100_pci_new, }; static const struct nvkm_device_chip nv172_chipset = { .name = "GA102", + .bios = nvkm_bios_new, .pci = gp100_pci_new, }; static const struct nvkm_device_chip nv174_chipset = { .name = "GA104", + .bios = nvkm_bios_new, .pci = gp100_pci_new, }; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowramin.c b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowramin.c index 3634cd0630b8..023ddc7c5399 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowramin.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/bios/shadowramin.c @@ -64,6 +64,9 @@ pramin_init(struct nvkm_bios *bios, const char *name) return NULL; /* we can't get the bios image pointer without PDISP */ + if (device->card_type >= GA100) + addr = device->chipset == 0x170; /*XXX: find the fuse reg for this */ + else if (device->card_type >= GM100) addr = nvkm_rd32(device, 0x021c04); else From 7ddf5e9597faa6f939370e294e0f6d9516d2a431 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 160/241] drm/nouveau/devinit/ga10[024]: initial support VPLL regs changed a bit. There's more stuff to do around these, but it's less invasive to stick those changes into disp for now. None of that belongs here anymore anyhow - fix that someday. Signed-off-by: Ben Skeggs --- .../drm/nouveau/include/nvkm/subdev/devinit.h | 1 + .../gpu/drm/nouveau/nvkm/engine/device/base.c | 3 + .../drm/nouveau/nvkm/subdev/devinit/Kbuild | 1 + .../drm/nouveau/nvkm/subdev/devinit/ga100.c | 76 +++++++++++++++++++ .../drm/nouveau/nvkm/subdev/devinit/priv.h | 1 + .../drm/nouveau/nvkm/subdev/devinit/tu102.c | 2 +- 6 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/nouveau/nvkm/subdev/devinit/ga100.c diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/devinit.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/devinit.h index 1a39e52e09e3..50cc7c05eac4 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/devinit.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/devinit.h @@ -32,4 +32,5 @@ int gm107_devinit_new(struct nvkm_device *, int, struct nvkm_devinit **); int gm200_devinit_new(struct nvkm_device *, int, struct nvkm_devinit **); int gv100_devinit_new(struct nvkm_device *, int, struct nvkm_devinit **); int tu102_devinit_new(struct nvkm_device *, int, struct nvkm_devinit **); +int ga100_devinit_new(struct nvkm_device *, int, struct nvkm_devinit **); #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index 7c35c32cdf73..76f1b2504f3a 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2656,6 +2656,7 @@ static const struct nvkm_device_chip nv170_chipset = { .name = "GA100", .bios = nvkm_bios_new, + .devinit = ga100_devinit_new, .pci = gp100_pci_new, }; @@ -2663,6 +2664,7 @@ static const struct nvkm_device_chip nv172_chipset = { .name = "GA102", .bios = nvkm_bios_new, + .devinit = ga100_devinit_new, .pci = gp100_pci_new, }; @@ -2670,6 +2672,7 @@ static const struct nvkm_device_chip nv174_chipset = { .name = "GA104", .bios = nvkm_bios_new, + .devinit = ga100_devinit_new, .pci = gp100_pci_new, }; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/Kbuild b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/Kbuild index b3429371ed82..d1abb64841da 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/Kbuild +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/Kbuild @@ -15,3 +15,4 @@ nvkm-y += nvkm/subdev/devinit/gm107.o nvkm-y += nvkm/subdev/devinit/gm200.o nvkm-y += nvkm/subdev/devinit/gv100.o nvkm-y += nvkm/subdev/devinit/tu102.o +nvkm-y += nvkm/subdev/devinit/ga100.o diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/ga100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/ga100.c new file mode 100644 index 000000000000..636a92128f6c --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/ga100.c @@ -0,0 +1,76 @@ +/* + * Copyright 2021 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "nv50.h" + +#include +#include +#include + +static int +ga100_devinit_pll_set(struct nvkm_devinit *init, u32 type, u32 freq) +{ + struct nvkm_subdev *subdev = &init->subdev; + struct nvkm_device *device = subdev->device; + struct nvbios_pll info; + int head = type - PLL_VPLL0; + int N, fN, M, P; + int ret; + + ret = nvbios_pll_parse(device->bios, type, &info); + if (ret) + return ret; + + ret = gt215_pll_calc(subdev, &info, freq, &N, &fN, &M, &P); + if (ret < 0) + return ret; + + switch (info.type) { + case PLL_VPLL0: + case PLL_VPLL1: + case PLL_VPLL2: + case PLL_VPLL3: + nvkm_wr32(device, 0x00ef00 + (head * 0x40), 0x02080004); + nvkm_wr32(device, 0x00ef18 + (head * 0x40), (N << 16) | fN); + nvkm_wr32(device, 0x00ef04 + (head * 0x40), (P << 16) | M); + nvkm_wr32(device, 0x00e9c0 + (head * 0x04), 0x00000001); + break; + default: + nvkm_warn(subdev, "%08x/%dKhz unimplemented\n", type, freq); + ret = -EINVAL; + break; + } + + return ret; +} + +static const struct nvkm_devinit_func +ga100_devinit = { + .init = nv50_devinit_init, + .post = tu102_devinit_post, + .pll_set = ga100_devinit_pll_set, +}; + +int +ga100_devinit_new(struct nvkm_device *device, int index, struct nvkm_devinit **pinit) +{ + return nv50_devinit_new_(&ga100_devinit, device, index, pinit); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/priv.h index 94723352137a..05961e624264 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/priv.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/priv.h @@ -19,4 +19,5 @@ void nvkm_devinit_ctor(const struct nvkm_devinit_func *, struct nvkm_device *, int index, struct nvkm_devinit *); int nv04_devinit_post(struct nvkm_devinit *, bool); +int tu102_devinit_post(struct nvkm_devinit *, bool); #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/tu102.c b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/tu102.c index 397670e72fff..9a469bf482f2 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/tu102.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/tu102.c @@ -65,7 +65,7 @@ tu102_devinit_pll_set(struct nvkm_devinit *init, u32 type, u32 freq) return ret; } -static int +int tu102_devinit_post(struct nvkm_devinit *base, bool post) { struct nv50_devinit *init = nv50_devinit(base); From 5961c62d20753009408df4752e22991097386aa9 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 161/241] drm/nouveau/mc/ga10[024]: initial support Fortunately, all the interrupts we need to bring up basic display support are contained in a single leaf register, allowing this basic (but hackish) implementation. There's a bunch more invasive patches to come implementing all this in a better/more complete way, but trying to get a minimal series out first. Signed-off-by: Ben Skeggs --- .../gpu/drm/nouveau/include/nvkm/subdev/mc.h | 1 + .../gpu/drm/nouveau/nvkm/engine/device/base.c | 3 + drivers/gpu/drm/nouveau/nvkm/subdev/mc/Kbuild | 1 + .../gpu/drm/nouveau/nvkm/subdev/mc/ga100.c | 74 +++++++++++++++++++ 4 files changed, 79 insertions(+) create mode 100644 drivers/gpu/drm/nouveau/nvkm/subdev/mc/ga100.c diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/mc.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/mc.h index 6641fe4c252c..e45ca4583967 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/mc.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/mc.h @@ -32,4 +32,5 @@ int gk20a_mc_new(struct nvkm_device *, int, struct nvkm_mc **); int gp100_mc_new(struct nvkm_device *, int, struct nvkm_mc **); int gp10b_mc_new(struct nvkm_device *, int, struct nvkm_mc **); int tu102_mc_new(struct nvkm_device *, int, struct nvkm_mc **); +int ga100_mc_new(struct nvkm_device *, int, struct nvkm_mc **); #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index 76f1b2504f3a..fb551edc6a35 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2657,6 +2657,7 @@ nv170_chipset = { .name = "GA100", .bios = nvkm_bios_new, .devinit = ga100_devinit_new, + .mc = ga100_mc_new, .pci = gp100_pci_new, }; @@ -2665,6 +2666,7 @@ nv172_chipset = { .name = "GA102", .bios = nvkm_bios_new, .devinit = ga100_devinit_new, + .mc = ga100_mc_new, .pci = gp100_pci_new, }; @@ -2673,6 +2675,7 @@ nv174_chipset = { .name = "GA104", .bios = nvkm_bios_new, .devinit = ga100_devinit_new, + .mc = ga100_mc_new, .pci = gp100_pci_new, }; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mc/Kbuild b/drivers/gpu/drm/nouveau/nvkm/subdev/mc/Kbuild index 2585ef07532a..ac2b34e9ac6a 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/mc/Kbuild +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mc/Kbuild @@ -14,3 +14,4 @@ nvkm-y += nvkm/subdev/mc/gk20a.o nvkm-y += nvkm/subdev/mc/gp100.o nvkm-y += nvkm/subdev/mc/gp10b.o nvkm-y += nvkm/subdev/mc/tu102.o +nvkm-y += nvkm/subdev/mc/ga100.o diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/mc/ga100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/mc/ga100.c new file mode 100644 index 000000000000..967eb3af11eb --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/mc/ga100.c @@ -0,0 +1,74 @@ +/* + * Copyright 2021 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "priv.h" + +static void +ga100_mc_intr_unarm(struct nvkm_mc *mc) +{ + nvkm_wr32(mc->subdev.device, 0xb81610, 0x00000004); +} + +static void +ga100_mc_intr_rearm(struct nvkm_mc *mc) +{ + nvkm_wr32(mc->subdev.device, 0xb81608, 0x00000004); +} + +static void +ga100_mc_intr_mask(struct nvkm_mc *mc, u32 mask, u32 intr) +{ + nvkm_wr32(mc->subdev.device, 0xb81210, mask & intr ); + nvkm_wr32(mc->subdev.device, 0xb81410, mask & ~(mask & intr)); +} + +static u32 +ga100_mc_intr_stat(struct nvkm_mc *mc) +{ + u32 intr_top = nvkm_rd32(mc->subdev.device, 0xb81600), intr = 0x00000000; + if (intr_top & 0x00000004) + intr = nvkm_mask(mc->subdev.device, 0xb81010, 0x00000000, 0x00000000); + return intr; +} + +static void +ga100_mc_init(struct nvkm_mc *mc) +{ + nv50_mc_init(mc); + nvkm_wr32(mc->subdev.device, 0xb81210, 0xffffffff); +} + +static const struct nvkm_mc_func +ga100_mc = { + .init = ga100_mc_init, + .intr = gp100_mc_intr, + .intr_unarm = ga100_mc_intr_unarm, + .intr_rearm = ga100_mc_intr_rearm, + .intr_mask = ga100_mc_intr_mask, + .intr_stat = ga100_mc_intr_stat, + .reset = gk104_mc_reset, +}; + +int +ga100_mc_new(struct nvkm_device *device, int index, struct nvkm_mc **pmc) +{ + return nvkm_mc_new_(&ga100_mc, device, index, pmc); +} From e0df4bbfc3365d7699e32bebb24647dc7a09b00c Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 162/241] drm/nouveau/privring/ga10[024]: initial support Appears to be compatible with GM200 code. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/engine/device/base.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index fb551edc6a35..2f6e1b1ce0bb 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2657,6 +2657,7 @@ nv170_chipset = { .name = "GA100", .bios = nvkm_bios_new, .devinit = ga100_devinit_new, + .ibus = gm200_ibus_new, .mc = ga100_mc_new, .pci = gp100_pci_new, }; @@ -2666,6 +2667,7 @@ nv172_chipset = { .name = "GA102", .bios = nvkm_bios_new, .devinit = ga100_devinit_new, + .ibus = gm200_ibus_new, .mc = ga100_mc_new, .pci = gp100_pci_new, }; @@ -2675,6 +2677,7 @@ nv174_chipset = { .name = "GA104", .bios = nvkm_bios_new, .devinit = ga100_devinit_new, + .ibus = gm200_ibus_new, .mc = ga100_mc_new, .pci = gp100_pci_new, }; From de4781d0f22b54fdbe7ac459eb67b585ca3ee430 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 163/241] drm/nouveau/imem/ga10[024]: initial support Appears to be compatible with NV50 code. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/engine/device/base.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index 2f6e1b1ce0bb..d2be48fc5db1 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2658,6 +2658,7 @@ nv170_chipset = { .bios = nvkm_bios_new, .devinit = ga100_devinit_new, .ibus = gm200_ibus_new, + .imem = nv50_instmem_new, .mc = ga100_mc_new, .pci = gp100_pci_new, }; @@ -2668,6 +2669,7 @@ nv172_chipset = { .bios = nvkm_bios_new, .devinit = ga100_devinit_new, .ibus = gm200_ibus_new, + .imem = nv50_instmem_new, .mc = ga100_mc_new, .pci = gp100_pci_new, }; @@ -2678,6 +2680,7 @@ nv174_chipset = { .bios = nvkm_bios_new, .devinit = ga100_devinit_new, .ibus = gm200_ibus_new, + .imem = nv50_instmem_new, .mc = ga100_mc_new, .pci = gp100_pci_new, }; From 41ba806f40a9a4c4f4c04a474bf368160f1baa2c Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 164/241] drm/nouveau/fb/ga10[024]: initial support No VPR scrub. GA102 and GA104 have a new VRAM size detection method. Signed-off-by: Ben Skeggs --- .../gpu/drm/nouveau/include/nvkm/subdev/fb.h | 2 + .../gpu/drm/nouveau/nvkm/engine/device/base.c | 3 ++ drivers/gpu/drm/nouveau/nvkm/subdev/fb/Kbuild | 3 ++ .../gpu/drm/nouveau/nvkm/subdev/fb/ga100.c | 40 +++++++++++++++++++ .../gpu/drm/nouveau/nvkm/subdev/fb/ga102.c | 40 +++++++++++++++++++ .../gpu/drm/nouveau/nvkm/subdev/fb/gv100.c | 2 +- drivers/gpu/drm/nouveau/nvkm/subdev/fb/priv.h | 2 + drivers/gpu/drm/nouveau/nvkm/subdev/fb/ram.h | 1 + .../gpu/drm/nouveau/nvkm/subdev/fb/ramga102.c | 40 +++++++++++++++++++ 9 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 drivers/gpu/drm/nouveau/nvkm/subdev/fb/ga100.c create mode 100644 drivers/gpu/drm/nouveau/nvkm/subdev/fb/ga102.c create mode 100644 drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramga102.c diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/fb.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/fb.h index 34b56b10218a..2ecd52aec1d1 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/fb.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/fb.h @@ -86,6 +86,8 @@ int gp100_fb_new(struct nvkm_device *, int, struct nvkm_fb **); int gp102_fb_new(struct nvkm_device *, int, struct nvkm_fb **); int gp10b_fb_new(struct nvkm_device *, int, struct nvkm_fb **); int gv100_fb_new(struct nvkm_device *, int, struct nvkm_fb **); +int ga100_fb_new(struct nvkm_device *, int, struct nvkm_fb **); +int ga102_fb_new(struct nvkm_device *, int, struct nvkm_fb **); #include #include diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index d2be48fc5db1..4a5d43fe3f3a 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2657,6 +2657,7 @@ nv170_chipset = { .name = "GA100", .bios = nvkm_bios_new, .devinit = ga100_devinit_new, + .fb = ga100_fb_new, .ibus = gm200_ibus_new, .imem = nv50_instmem_new, .mc = ga100_mc_new, @@ -2668,6 +2669,7 @@ nv172_chipset = { .name = "GA102", .bios = nvkm_bios_new, .devinit = ga100_devinit_new, + .fb = ga102_fb_new, .ibus = gm200_ibus_new, .imem = nv50_instmem_new, .mc = ga100_mc_new, @@ -2679,6 +2681,7 @@ nv174_chipset = { .name = "GA104", .bios = nvkm_bios_new, .devinit = ga100_devinit_new, + .fb = ga102_fb_new, .ibus = gm200_ibus_new, .imem = nv50_instmem_new, .mc = ga100_mc_new, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/Kbuild b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/Kbuild index 43a42159a3d0..5d0bab8ecb43 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/Kbuild +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/Kbuild @@ -32,6 +32,8 @@ nvkm-y += nvkm/subdev/fb/gp100.o nvkm-y += nvkm/subdev/fb/gp102.o nvkm-y += nvkm/subdev/fb/gp10b.o nvkm-y += nvkm/subdev/fb/gv100.o +nvkm-y += nvkm/subdev/fb/ga100.o +nvkm-y += nvkm/subdev/fb/ga102.o nvkm-y += nvkm/subdev/fb/ram.o nvkm-y += nvkm/subdev/fb/ramnv04.o @@ -52,6 +54,7 @@ nvkm-y += nvkm/subdev/fb/ramgk104.o nvkm-y += nvkm/subdev/fb/ramgm107.o nvkm-y += nvkm/subdev/fb/ramgm200.o nvkm-y += nvkm/subdev/fb/ramgp100.o +nvkm-y += nvkm/subdev/fb/ramga102.o nvkm-y += nvkm/subdev/fb/sddr2.o nvkm-y += nvkm/subdev/fb/sddr3.o nvkm-y += nvkm/subdev/fb/gddr3.o diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ga100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ga100.c new file mode 100644 index 000000000000..bf82686851cd --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ga100.c @@ -0,0 +1,40 @@ +/* + * Copyright 2021 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "gf100.h" +#include "ram.h" + +static const struct nvkm_fb_func +ga100_fb = { + .dtor = gf100_fb_dtor, + .oneinit = gf100_fb_oneinit, + .init = gp100_fb_init, + .init_page = gv100_fb_init_page, + .init_unkn = gp100_fb_init_unkn, + .ram_new = gp100_ram_new, + .default_bigpage = 16, +}; + +int +ga100_fb_new(struct nvkm_device *device, int index, struct nvkm_fb **pfb) +{ + return gp102_fb_new_(&ga100_fb, device, index, pfb); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ga102.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ga102.c new file mode 100644 index 000000000000..bcecf84a6e67 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ga102.c @@ -0,0 +1,40 @@ +/* + * Copyright 2021 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "gf100.h" +#include "ram.h" + +static const struct nvkm_fb_func +ga102_fb = { + .dtor = gf100_fb_dtor, + .oneinit = gf100_fb_oneinit, + .init = gp100_fb_init, + .init_page = gv100_fb_init_page, + .init_unkn = gp100_fb_init_unkn, + .ram_new = ga102_ram_new, + .default_bigpage = 16, +}; + +int +ga102_fb_new(struct nvkm_device *device, int index, struct nvkm_fb **pfb) +{ + return gp102_fb_new_(&ga102_fb, device, index, pfb); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gv100.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gv100.c index 10ff5d053f7e..feda86a5fba8 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gv100.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/gv100.c @@ -22,7 +22,7 @@ #include "gf100.h" #include "ram.h" -static int +int gv100_fb_init_page(struct nvkm_fb *fb) { return (fb->page == 16) ? 0 : -EINVAL; diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/priv.h b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/priv.h index 5be9c563350d..66932ac10d15 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/priv.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/priv.h @@ -82,4 +82,6 @@ int gp102_fb_new_(const struct nvkm_fb_func *, struct nvkm_device *, int, struct nvkm_fb **); bool gp102_fb_vpr_scrub_required(struct nvkm_fb *); int gp102_fb_vpr_scrub(struct nvkm_fb *); + +int gv100_fb_init_page(struct nvkm_fb *); #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ram.h b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ram.h index d723a9b4e3c4..ea7d66f3dd82 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ram.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ram.h @@ -70,4 +70,5 @@ int gk104_ram_new(struct nvkm_fb *, struct nvkm_ram **); int gm107_ram_new(struct nvkm_fb *, struct nvkm_ram **); int gm200_ram_new(struct nvkm_fb *, struct nvkm_ram **); int gp100_ram_new(struct nvkm_fb *, struct nvkm_ram **); +int ga102_ram_new(struct nvkm_fb *, struct nvkm_ram **); #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramga102.c b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramga102.c new file mode 100644 index 000000000000..298c136cefe0 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/fb/ramga102.c @@ -0,0 +1,40 @@ +/* + * Copyright 2021 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "ram.h" + +#include +#include +#include + +static const struct nvkm_ram_func +ga102_ram = { +}; + +int +ga102_ram_new(struct nvkm_fb *fb, struct nvkm_ram **pram) +{ + struct nvkm_device *device = fb->subdev.device; + enum nvkm_ram_type type = nvkm_fb_bios_memtype(device->bios); + u32 size = nvkm_rd32(device, 0x1183a4); + + return nvkm_ram_new_(&ga102_ram, fb, type, (u64)size << 20, pram); +} From 6f300e0a0ba8873f1225959089f8bb2897d93ec6 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 165/241] drm/nouveau/timer/ga10[024]: initial support Appears to be compatible with GK20A code. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/engine/device/base.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index 4a5d43fe3f3a..6a1920965c01 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2662,6 +2662,7 @@ nv170_chipset = { .imem = nv50_instmem_new, .mc = ga100_mc_new, .pci = gp100_pci_new, + .timer = gk20a_timer_new, }; static const struct nvkm_device_chip @@ -2674,6 +2675,7 @@ nv172_chipset = { .imem = nv50_instmem_new, .mc = ga100_mc_new, .pci = gp100_pci_new, + .timer = gk20a_timer_new, }; static const struct nvkm_device_chip @@ -2686,6 +2688,7 @@ nv174_chipset = { .imem = nv50_instmem_new, .mc = ga100_mc_new, .pci = gp100_pci_new, + .timer = gk20a_timer_new, }; static int From a3abc23ac40111c76708119013d63451169e7838 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 166/241] drm/nouveau/mmu/ga10[024]: initial support Appears to be compatible with TU102 code. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/engine/device/base.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index 6a1920965c01..ad4cece038b4 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2661,6 +2661,7 @@ nv170_chipset = { .ibus = gm200_ibus_new, .imem = nv50_instmem_new, .mc = ga100_mc_new, + .mmu = tu102_mmu_new, .pci = gp100_pci_new, .timer = gk20a_timer_new, }; @@ -2674,6 +2675,7 @@ nv172_chipset = { .ibus = gm200_ibus_new, .imem = nv50_instmem_new, .mc = ga100_mc_new, + .mmu = tu102_mmu_new, .pci = gp100_pci_new, .timer = gk20a_timer_new, }; @@ -2687,6 +2689,7 @@ nv174_chipset = { .ibus = gm200_ibus_new, .imem = nv50_instmem_new, .mc = ga100_mc_new, + .mmu = tu102_mmu_new, .pci = gp100_pci_new, .timer = gk20a_timer_new, }; From f5cbe7c8bd1ac6f8c91179de381e10ee5f0f8809 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 167/241] drm/nouveau/bar/ga10[024]: initial support Appears to be compatible with TU102 code. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/engine/device/base.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index ad4cece038b4..c9cc08ba1444 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2655,6 +2655,7 @@ nv168_chipset = { static const struct nvkm_device_chip nv170_chipset = { .name = "GA100", + .bar = tu102_bar_new, .bios = nvkm_bios_new, .devinit = ga100_devinit_new, .fb = ga100_fb_new, @@ -2669,6 +2670,7 @@ nv170_chipset = { static const struct nvkm_device_chip nv172_chipset = { .name = "GA102", + .bar = tu102_bar_new, .bios = nvkm_bios_new, .devinit = ga100_devinit_new, .fb = ga102_fb_new, @@ -2683,6 +2685,7 @@ nv172_chipset = { static const struct nvkm_device_chip nv174_chipset = { .name = "GA104", + .bar = tu102_bar_new, .bios = nvkm_bios_new, .devinit = ga100_devinit_new, .fb = ga102_fb_new, From c28efb15f9e51a96c6bce2b92c0f3a4da87db877 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 168/241] drm/nouveau/gpio/ga10[024]: initial support GA100 appears to be compatible with GK104 code, the others have some register moves. Signed-off-by: Ben Skeggs --- .../drm/nouveau/include/nvkm/subdev/gpio.h | 1 + .../gpu/drm/nouveau/nvkm/engine/device/base.c | 3 + .../gpu/drm/nouveau/nvkm/subdev/gpio/Kbuild | 1 + .../gpu/drm/nouveau/nvkm/subdev/gpio/ga102.c | 118 ++++++++++++++++++ 4 files changed, 123 insertions(+) create mode 100644 drivers/gpu/drm/nouveau/nvkm/subdev/gpio/ga102.c diff --git a/drivers/gpu/drm/nouveau/include/nvkm/subdev/gpio.h b/drivers/gpu/drm/nouveau/include/nvkm/subdev/gpio.h index eaacf8d80527..cdcce5ece6ff 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/subdev/gpio.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/subdev/gpio.h @@ -37,4 +37,5 @@ int nv50_gpio_new(struct nvkm_device *, int, struct nvkm_gpio **); int g94_gpio_new(struct nvkm_device *, int, struct nvkm_gpio **); int gf119_gpio_new(struct nvkm_device *, int, struct nvkm_gpio **); int gk104_gpio_new(struct nvkm_device *, int, struct nvkm_gpio **); +int ga102_gpio_new(struct nvkm_device *, int, struct nvkm_gpio **); #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index c9cc08ba1444..20d947808ef7 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2659,6 +2659,7 @@ nv170_chipset = { .bios = nvkm_bios_new, .devinit = ga100_devinit_new, .fb = ga100_fb_new, + .gpio = gk104_gpio_new, .ibus = gm200_ibus_new, .imem = nv50_instmem_new, .mc = ga100_mc_new, @@ -2674,6 +2675,7 @@ nv172_chipset = { .bios = nvkm_bios_new, .devinit = ga100_devinit_new, .fb = ga102_fb_new, + .gpio = ga102_gpio_new, .ibus = gm200_ibus_new, .imem = nv50_instmem_new, .mc = ga100_mc_new, @@ -2689,6 +2691,7 @@ nv174_chipset = { .bios = nvkm_bios_new, .devinit = ga100_devinit_new, .fb = ga102_fb_new, + .gpio = ga102_gpio_new, .ibus = gm200_ibus_new, .imem = nv50_instmem_new, .mc = ga100_mc_new, diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gpio/Kbuild b/drivers/gpu/drm/nouveau/nvkm/subdev/gpio/Kbuild index b2ad5922a1c2..efbbaa080de5 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/gpio/Kbuild +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gpio/Kbuild @@ -5,3 +5,4 @@ nvkm-y += nvkm/subdev/gpio/nv50.o nvkm-y += nvkm/subdev/gpio/g94.o nvkm-y += nvkm/subdev/gpio/gf119.o nvkm-y += nvkm/subdev/gpio/gk104.o +nvkm-y += nvkm/subdev/gpio/ga102.o diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/gpio/ga102.c b/drivers/gpu/drm/nouveau/nvkm/subdev/gpio/ga102.c new file mode 100644 index 000000000000..62c791baf400 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/gpio/ga102.c @@ -0,0 +1,118 @@ +/* + * Copyright 2021 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "priv.h" + +static void +ga102_gpio_reset(struct nvkm_gpio *gpio, u8 match) +{ + struct nvkm_device *device = gpio->subdev.device; + struct nvkm_bios *bios = device->bios; + u8 ver, len; + u16 entry; + int ent = -1; + + while ((entry = dcb_gpio_entry(bios, 0, ++ent, &ver, &len))) { + u32 data = nvbios_rd32(bios, entry); + u8 line = (data & 0x0000003f); + u8 defs = !!(data & 0x00000080); + u8 func = (data & 0x0000ff00) >> 8; + u8 unk0 = (data & 0x00ff0000) >> 16; + u8 unk1 = (data & 0x1f000000) >> 24; + + if ( func == DCB_GPIO_UNUSED || + (match != DCB_GPIO_UNUSED && match != func)) + continue; + + nvkm_gpio_set(gpio, 0, func, line, defs); + + nvkm_mask(device, 0x021200 + (line * 4), 0xff, unk0); + if (unk1--) + nvkm_mask(device, 0x00d740 + (unk1 * 4), 0xff, line); + } +} + +static int +ga102_gpio_drive(struct nvkm_gpio *gpio, int line, int dir, int out) +{ + struct nvkm_device *device = gpio->subdev.device; + u32 data = ((dir ^ 1) << 13) | (out << 12); + nvkm_mask(device, 0x021200 + (line * 4), 0x00003000, data); + nvkm_mask(device, 0x00d604, 0x00000001, 0x00000001); /* update? */ + return 0; +} + +static int +ga102_gpio_sense(struct nvkm_gpio *gpio, int line) +{ + struct nvkm_device *device = gpio->subdev.device; + return !!(nvkm_rd32(device, 0x021200 + (line * 4)) & 0x00004000); +} + +static void +ga102_gpio_intr_stat(struct nvkm_gpio *gpio, u32 *hi, u32 *lo) +{ + struct nvkm_device *device = gpio->subdev.device; + u32 intr0 = nvkm_rd32(device, 0x021640); + u32 intr1 = nvkm_rd32(device, 0x02164c); + u32 stat0 = nvkm_rd32(device, 0x021648) & intr0; + u32 stat1 = nvkm_rd32(device, 0x021654) & intr1; + *lo = (stat1 & 0xffff0000) | (stat0 >> 16); + *hi = (stat1 << 16) | (stat0 & 0x0000ffff); + nvkm_wr32(device, 0x021640, intr0); + nvkm_wr32(device, 0x02164c, intr1); +} + +static void +ga102_gpio_intr_mask(struct nvkm_gpio *gpio, u32 type, u32 mask, u32 data) +{ + struct nvkm_device *device = gpio->subdev.device; + u32 inte0 = nvkm_rd32(device, 0x021648); + u32 inte1 = nvkm_rd32(device, 0x021654); + if (type & NVKM_GPIO_LO) + inte0 = (inte0 & ~(mask << 16)) | (data << 16); + if (type & NVKM_GPIO_HI) + inte0 = (inte0 & ~(mask & 0xffff)) | (data & 0xffff); + mask >>= 16; + data >>= 16; + if (type & NVKM_GPIO_LO) + inte1 = (inte1 & ~(mask << 16)) | (data << 16); + if (type & NVKM_GPIO_HI) + inte1 = (inte1 & ~mask) | data; + nvkm_wr32(device, 0x021648, inte0); + nvkm_wr32(device, 0x021654, inte1); +} + +static const struct nvkm_gpio_func +ga102_gpio = { + .lines = 32, + .intr_stat = ga102_gpio_intr_stat, + .intr_mask = ga102_gpio_intr_mask, + .drive = ga102_gpio_drive, + .sense = ga102_gpio_sense, + .reset = ga102_gpio_reset, +}; + +int +ga102_gpio_new(struct nvkm_device *device, int index, struct nvkm_gpio **pgpio) +{ + return nvkm_gpio_new_(&ga102_gpio, device, index, pgpio); +} From 8a0412265f06490d93724bf8badf220180790ad1 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 169/241] drm/nouveau/i2c/ga10[024]: initial support Appears to be compatible with GM200 code. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/engine/device/base.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index 20d947808ef7..946b0001ca54 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2660,6 +2660,7 @@ nv170_chipset = { .devinit = ga100_devinit_new, .fb = ga100_fb_new, .gpio = gk104_gpio_new, + .i2c = gm200_i2c_new, .ibus = gm200_ibus_new, .imem = nv50_instmem_new, .mc = ga100_mc_new, @@ -2676,6 +2677,7 @@ nv172_chipset = { .devinit = ga100_devinit_new, .fb = ga102_fb_new, .gpio = ga102_gpio_new, + .i2c = gm200_i2c_new, .ibus = gm200_ibus_new, .imem = nv50_instmem_new, .mc = ga100_mc_new, @@ -2692,6 +2694,7 @@ nv174_chipset = { .devinit = ga100_devinit_new, .fb = ga102_fb_new, .gpio = ga102_gpio_new, + .i2c = gm200_i2c_new, .ibus = gm200_ibus_new, .imem = nv50_instmem_new, .mc = ga100_mc_new, From a6cf0320aad0c69a6b558dd41d3cb6891a6c9872 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 170/241] drm/nouveau/dmaobj/ga10[24]: initial support Appears to be compatible with GV100 code, and not required on GA100, as it shouldn't have display. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/nvkm/engine/device/base.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index 946b0001ca54..bea3daf08570 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2684,6 +2684,7 @@ nv172_chipset = { .mmu = tu102_mmu_new, .pci = gp100_pci_new, .timer = gk20a_timer_new, + .dma = gv100_dma_new, }; static const struct nvkm_device_chip @@ -2701,6 +2702,7 @@ nv174_chipset = { .mmu = tu102_mmu_new, .pci = gp100_pci_new, .timer = gk20a_timer_new, + .dma = gv100_dma_new, }; static int From 8ef23b6f6a79e6fa2a169081d2d76011fffa0482 Mon Sep 17 00:00:00 2001 From: Ben Skeggs Date: Wed, 13 Jan 2021 17:12:52 +1000 Subject: [PATCH 171/241] drm/nouveau/disp/ga10[24]: initial support UEFI/RM no longer use IED scripts from the VBIOS, though they appear to have been updated for use by the x86 VBIOS code, so we should be able to continue using them for the moment. Unfortunately, we require some hacks to do so, as the BeforeLinkTraining IED script became a pointer to an array of scripts instead, without a revbump of the relevant tables. There's also some changes to SOR clock divider fiddling, which are hopefully correct enough that things work as they should. AFAIK, GA100 shouldn't have display, so it hasn't been added. Signed-off-by: Ben Skeggs --- drivers/gpu/drm/nouveau/dispnv50/Kbuild | 1 + drivers/gpu/drm/nouveau/dispnv50/core.c | 1 + drivers/gpu/drm/nouveau/dispnv50/curs.c | 1 + drivers/gpu/drm/nouveau/dispnv50/wimm.c | 1 + drivers/gpu/drm/nouveau/dispnv50/wndw.c | 1 + drivers/gpu/drm/nouveau/dispnv50/wndw.h | 8 + drivers/gpu/drm/nouveau/dispnv50/wndwc57e.c | 10 +- drivers/gpu/drm/nouveau/dispnv50/wndwc67e.c | 106 +++++++++++++ drivers/gpu/drm/nouveau/include/nvif/class.h | 5 + .../drm/nouveau/include/nvkm/engine/disp.h | 1 + drivers/gpu/drm/nouveau/nvif/disp.c | 1 + .../gpu/drm/nouveau/nvkm/engine/device/base.c | 2 + .../gpu/drm/nouveau/nvkm/engine/disp/Kbuild | 3 + drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.c | 33 ++++- .../gpu/drm/nouveau/nvkm/engine/disp/ga102.c | 46 ++++++ .../gpu/drm/nouveau/nvkm/engine/disp/ior.h | 4 + .../gpu/drm/nouveau/nvkm/engine/disp/nv50.h | 2 + .../drm/nouveau/nvkm/engine/disp/rootga102.c | 52 +++++++ .../drm/nouveau/nvkm/engine/disp/rootnv50.h | 1 + .../drm/nouveau/nvkm/engine/disp/sorga102.c | 140 ++++++++++++++++++ .../drm/nouveau/nvkm/engine/disp/sortu102.c | 2 +- .../gpu/drm/nouveau/nvkm/engine/disp/tu102.c | 2 +- 22 files changed, 410 insertions(+), 13 deletions(-) create mode 100644 drivers/gpu/drm/nouveau/dispnv50/wndwc67e.c create mode 100644 drivers/gpu/drm/nouveau/nvkm/engine/disp/ga102.c create mode 100644 drivers/gpu/drm/nouveau/nvkm/engine/disp/rootga102.c create mode 100644 drivers/gpu/drm/nouveau/nvkm/engine/disp/sorga102.c diff --git a/drivers/gpu/drm/nouveau/dispnv50/Kbuild b/drivers/gpu/drm/nouveau/dispnv50/Kbuild index 6fdddb266fb1..4488e1c061b3 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/Kbuild +++ b/drivers/gpu/drm/nouveau/dispnv50/Kbuild @@ -37,6 +37,7 @@ nouveau-y += dispnv50/wimmc37b.o nouveau-y += dispnv50/wndw.o nouveau-y += dispnv50/wndwc37e.o nouveau-y += dispnv50/wndwc57e.o +nouveau-y += dispnv50/wndwc67e.o nouveau-y += dispnv50/base.o nouveau-y += dispnv50/base507c.o diff --git a/drivers/gpu/drm/nouveau/dispnv50/core.c b/drivers/gpu/drm/nouveau/dispnv50/core.c index 27ea3f34706d..abefc2343443 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/core.c +++ b/drivers/gpu/drm/nouveau/dispnv50/core.c @@ -42,6 +42,7 @@ nv50_core_new(struct nouveau_drm *drm, struct nv50_core **pcore) int version; int (*new)(struct nouveau_drm *, s32, struct nv50_core **); } cores[] = { + { GA102_DISP_CORE_CHANNEL_DMA, 0, corec57d_new }, { TU102_DISP_CORE_CHANNEL_DMA, 0, corec57d_new }, { GV100_DISP_CORE_CHANNEL_DMA, 0, corec37d_new }, { GP102_DISP_CORE_CHANNEL_DMA, 0, core917d_new }, diff --git a/drivers/gpu/drm/nouveau/dispnv50/curs.c b/drivers/gpu/drm/nouveau/dispnv50/curs.c index 121c24a18f11..31d8b2e4791d 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/curs.c +++ b/drivers/gpu/drm/nouveau/dispnv50/curs.c @@ -31,6 +31,7 @@ nv50_curs_new(struct nouveau_drm *drm, int head, struct nv50_wndw **pwndw) int version; int (*new)(struct nouveau_drm *, int, s32, struct nv50_wndw **); } curses[] = { + { GA102_DISP_CURSOR, 0, cursc37a_new }, { TU102_DISP_CURSOR, 0, cursc37a_new }, { GV100_DISP_CURSOR, 0, cursc37a_new }, { GK104_DISP_CURSOR, 0, curs907a_new }, diff --git a/drivers/gpu/drm/nouveau/dispnv50/wimm.c b/drivers/gpu/drm/nouveau/dispnv50/wimm.c index a1ac153d5e98..566fbddfc8d7 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/wimm.c +++ b/drivers/gpu/drm/nouveau/dispnv50/wimm.c @@ -31,6 +31,7 @@ nv50_wimm_init(struct nouveau_drm *drm, struct nv50_wndw *wndw) int version; int (*init)(struct nouveau_drm *, s32, struct nv50_wndw *); } wimms[] = { + { GA102_DISP_WINDOW_IMM_CHANNEL_DMA, 0, wimmc37b_init }, { TU102_DISP_WINDOW_IMM_CHANNEL_DMA, 0, wimmc37b_init }, { GV100_DISP_WINDOW_IMM_CHANNEL_DMA, 0, wimmc37b_init }, {} diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndw.c b/drivers/gpu/drm/nouveau/dispnv50/wndw.c index 0356474ad6f6..ce451242f79e 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/wndw.c +++ b/drivers/gpu/drm/nouveau/dispnv50/wndw.c @@ -784,6 +784,7 @@ nv50_wndw_new(struct nouveau_drm *drm, enum drm_plane_type type, int index, int (*new)(struct nouveau_drm *, enum drm_plane_type, int, s32, struct nv50_wndw **); } wndws[] = { + { GA102_DISP_WINDOW_CHANNEL_DMA, 0, wndwc67e_new }, { TU102_DISP_WINDOW_CHANNEL_DMA, 0, wndwc57e_new }, { GV100_DISP_WINDOW_CHANNEL_DMA, 0, wndwc37e_new }, {} diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndw.h b/drivers/gpu/drm/nouveau/dispnv50/wndw.h index 3278e2880034..f4e0c5080034 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/wndw.h +++ b/drivers/gpu/drm/nouveau/dispnv50/wndw.h @@ -129,6 +129,14 @@ int wndwc37e_update(struct nv50_wndw *, u32 *); int wndwc57e_new(struct nouveau_drm *, enum drm_plane_type, int, s32, struct nv50_wndw **); +bool wndwc57e_ilut(struct nv50_wndw *, struct nv50_wndw_atom *, int); +int wndwc57e_ilut_set(struct nv50_wndw *, struct nv50_wndw_atom *); +int wndwc57e_ilut_clr(struct nv50_wndw *); +int wndwc57e_csc_set(struct nv50_wndw *, struct nv50_wndw_atom *); +int wndwc57e_csc_clr(struct nv50_wndw *); + +int wndwc67e_new(struct nouveau_drm *, enum drm_plane_type, int, s32, + struct nv50_wndw **); int nv50_wndw_new(struct nouveau_drm *, enum drm_plane_type, int index, struct nv50_wndw **); diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndwc57e.c b/drivers/gpu/drm/nouveau/dispnv50/wndwc57e.c index 429be0bb0222..abdd3bb658b3 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/wndwc57e.c +++ b/drivers/gpu/drm/nouveau/dispnv50/wndwc57e.c @@ -80,7 +80,7 @@ wndwc57e_image_set(struct nv50_wndw *wndw, struct nv50_wndw_atom *asyw) return 0; } -static int +int wndwc57e_csc_clr(struct nv50_wndw *wndw) { struct nvif_push *push = wndw->wndw.push; @@ -98,7 +98,7 @@ wndwc57e_csc_clr(struct nv50_wndw *wndw) return 0; } -static int +int wndwc57e_csc_set(struct nv50_wndw *wndw, struct nv50_wndw_atom *asyw) { struct nvif_push *push = wndw->wndw.push; @@ -111,7 +111,7 @@ wndwc57e_csc_set(struct nv50_wndw *wndw, struct nv50_wndw_atom *asyw) return 0; } -static int +int wndwc57e_ilut_clr(struct nv50_wndw *wndw) { struct nvif_push *push = wndw->wndw.push; @@ -124,7 +124,7 @@ wndwc57e_ilut_clr(struct nv50_wndw *wndw) return 0; } -static int +int wndwc57e_ilut_set(struct nv50_wndw *wndw, struct nv50_wndw_atom *asyw) { struct nvif_push *push = wndw->wndw.push; @@ -179,7 +179,7 @@ wndwc57e_ilut_load(struct drm_color_lut *in, int size, void __iomem *mem) writew(readw(mem - 4), mem + 4); } -static bool +bool wndwc57e_ilut(struct nv50_wndw *wndw, struct nv50_wndw_atom *asyw, int size) { if (size = size ? size : 1024, size != 256 && size != 1024) diff --git a/drivers/gpu/drm/nouveau/dispnv50/wndwc67e.c b/drivers/gpu/drm/nouveau/dispnv50/wndwc67e.c new file mode 100644 index 000000000000..7a370fa1df20 --- /dev/null +++ b/drivers/gpu/drm/nouveau/dispnv50/wndwc67e.c @@ -0,0 +1,106 @@ +/* + * Copyright 2021 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "wndw.h" +#include "atom.h" + +#include + +#include + +static int +wndwc67e_image_set(struct nv50_wndw *wndw, struct nv50_wndw_atom *asyw) +{ + struct nvif_push *push = wndw->wndw.push; + int ret; + + if ((ret = PUSH_WAIT(push, 17))) + return ret; + + PUSH_MTHD(push, NVC57E, SET_PRESENT_CONTROL, + NVVAL(NVC57E, SET_PRESENT_CONTROL, MIN_PRESENT_INTERVAL, asyw->image.interval) | + NVVAL(NVC57E, SET_PRESENT_CONTROL, BEGIN_MODE, asyw->image.mode) | + NVDEF(NVC57E, SET_PRESENT_CONTROL, TIMESTAMP_MODE, DISABLE)); + + PUSH_MTHD(push, NVC57E, SET_SIZE, + NVVAL(NVC57E, SET_SIZE, WIDTH, asyw->image.w) | + NVVAL(NVC57E, SET_SIZE, HEIGHT, asyw->image.h), + + SET_STORAGE, + NVVAL(NVC57E, SET_STORAGE, BLOCK_HEIGHT, asyw->image.blockh), + + SET_PARAMS, + NVVAL(NVC57E, SET_PARAMS, FORMAT, asyw->image.format) | + NVDEF(NVC57E, SET_PARAMS, CLAMP_BEFORE_BLEND, DISABLE) | + NVDEF(NVC57E, SET_PARAMS, SWAP_UV, DISABLE) | + NVDEF(NVC57E, SET_PARAMS, FMT_ROUNDING_MODE, ROUND_TO_NEAREST), + + SET_PLANAR_STORAGE(0), + NVVAL(NVC57E, SET_PLANAR_STORAGE, PITCH, asyw->image.blocks[0]) | + NVVAL(NVC57E, SET_PLANAR_STORAGE, PITCH, asyw->image.pitch[0] >> 6)); + + PUSH_MTHD(push, NVC57E, SET_CONTEXT_DMA_ISO(0), asyw->image.handle, 1); + PUSH_MTHD(push, NVC57E, SET_OFFSET(0), asyw->image.offset[0] >> 8); + + PUSH_MTHD(push, NVC57E, SET_POINT_IN(0), + NVVAL(NVC57E, SET_POINT_IN, X, asyw->state.src_x >> 16) | + NVVAL(NVC57E, SET_POINT_IN, Y, asyw->state.src_y >> 16)); + + PUSH_MTHD(push, NVC57E, SET_SIZE_IN, + NVVAL(NVC57E, SET_SIZE_IN, WIDTH, asyw->state.src_w >> 16) | + NVVAL(NVC57E, SET_SIZE_IN, HEIGHT, asyw->state.src_h >> 16)); + + PUSH_MTHD(push, NVC57E, SET_SIZE_OUT, + NVVAL(NVC57E, SET_SIZE_OUT, WIDTH, asyw->state.crtc_w) | + NVVAL(NVC57E, SET_SIZE_OUT, HEIGHT, asyw->state.crtc_h)); + return 0; +} + +static const struct nv50_wndw_func +wndwc67e = { + .acquire = wndwc37e_acquire, + .release = wndwc37e_release, + .sema_set = wndwc37e_sema_set, + .sema_clr = wndwc37e_sema_clr, + .ntfy_set = wndwc37e_ntfy_set, + .ntfy_clr = wndwc37e_ntfy_clr, + .ntfy_reset = corec37d_ntfy_init, + .ntfy_wait_begun = base507c_ntfy_wait_begun, + .ilut = wndwc57e_ilut, + .ilut_identity = true, + .ilut_size = 1024, + .xlut_set = wndwc57e_ilut_set, + .xlut_clr = wndwc57e_ilut_clr, + .csc = base907c_csc, + .csc_set = wndwc57e_csc_set, + .csc_clr = wndwc57e_csc_clr, + .image_set = wndwc67e_image_set, + .image_clr = wndwc37e_image_clr, + .blend_set = wndwc37e_blend_set, + .update = wndwc37e_update, +}; + +int +wndwc67e_new(struct nouveau_drm *drm, enum drm_plane_type type, int index, + s32 oclass, struct nv50_wndw **pwndw) +{ + return wndwc37e_new_(&wndwc67e, drm, type, index, oclass, BIT(index >> 1), pwndw); +} diff --git a/drivers/gpu/drm/nouveau/include/nvif/class.h b/drivers/gpu/drm/nouveau/include/nvif/class.h index 2c79beb41126..ba2c28ea43d2 100644 --- a/drivers/gpu/drm/nouveau/include/nvif/class.h +++ b/drivers/gpu/drm/nouveau/include/nvif/class.h @@ -88,6 +88,7 @@ #define GP102_DISP /* cl5070.h */ 0x00009870 #define GV100_DISP /* cl5070.h */ 0x0000c370 #define TU102_DISP /* cl5070.h */ 0x0000c570 +#define GA102_DISP /* cl5070.h */ 0x0000c670 #define GV100_DISP_CAPS 0x0000c373 @@ -103,6 +104,7 @@ #define GK104_DISP_CURSOR /* cl507a.h */ 0x0000917a #define GV100_DISP_CURSOR /* cl507a.h */ 0x0000c37a #define TU102_DISP_CURSOR /* cl507a.h */ 0x0000c57a +#define GA102_DISP_CURSOR /* cl507a.h */ 0x0000c67a #define NV50_DISP_OVERLAY /* cl507b.h */ 0x0000507b #define G82_DISP_OVERLAY /* cl507b.h */ 0x0000827b @@ -112,6 +114,7 @@ #define GV100_DISP_WINDOW_IMM_CHANNEL_DMA /* clc37b.h */ 0x0000c37b #define TU102_DISP_WINDOW_IMM_CHANNEL_DMA /* clc37b.h */ 0x0000c57b +#define GA102_DISP_WINDOW_IMM_CHANNEL_DMA /* clc37b.h */ 0x0000c67b #define NV50_DISP_BASE_CHANNEL_DMA /* cl507c.h */ 0x0000507c #define G82_DISP_BASE_CHANNEL_DMA /* cl507c.h */ 0x0000827c @@ -135,6 +138,7 @@ #define GP102_DISP_CORE_CHANNEL_DMA /* cl507d.h */ 0x0000987d #define GV100_DISP_CORE_CHANNEL_DMA /* cl507d.h */ 0x0000c37d #define TU102_DISP_CORE_CHANNEL_DMA /* cl507d.h */ 0x0000c57d +#define GA102_DISP_CORE_CHANNEL_DMA /* cl507d.h */ 0x0000c67d #define NV50_DISP_OVERLAY_CHANNEL_DMA /* cl507e.h */ 0x0000507e #define G82_DISP_OVERLAY_CHANNEL_DMA /* cl507e.h */ 0x0000827e @@ -145,6 +149,7 @@ #define GV100_DISP_WINDOW_CHANNEL_DMA /* clc37e.h */ 0x0000c37e #define TU102_DISP_WINDOW_CHANNEL_DMA /* clc37e.h */ 0x0000c57e +#define GA102_DISP_WINDOW_CHANNEL_DMA /* clc37e.h */ 0x0000c67e #define NV50_TESLA 0x00005097 #define G82_TESLA 0x00008297 diff --git a/drivers/gpu/drm/nouveau/include/nvkm/engine/disp.h b/drivers/gpu/drm/nouveau/include/nvkm/engine/disp.h index 5a96c942d912..0f6fa6631a19 100644 --- a/drivers/gpu/drm/nouveau/include/nvkm/engine/disp.h +++ b/drivers/gpu/drm/nouveau/include/nvkm/engine/disp.h @@ -37,4 +37,5 @@ int gp100_disp_new(struct nvkm_device *, int, struct nvkm_disp **); int gp102_disp_new(struct nvkm_device *, int, struct nvkm_disp **); int gv100_disp_new(struct nvkm_device *, int, struct nvkm_disp **); int tu102_disp_new(struct nvkm_device *, int, struct nvkm_disp **); +int ga102_disp_new(struct nvkm_device *, int, struct nvkm_disp **); #endif diff --git a/drivers/gpu/drm/nouveau/nvif/disp.c b/drivers/gpu/drm/nouveau/nvif/disp.c index 8d0d30e08f57..529cb60d5efb 100644 --- a/drivers/gpu/drm/nouveau/nvif/disp.c +++ b/drivers/gpu/drm/nouveau/nvif/disp.c @@ -35,6 +35,7 @@ nvif_disp_ctor(struct nvif_device *device, const char *name, s32 oclass, struct nvif_disp *disp) { static const struct nvif_mclass disps[] = { + { GA102_DISP, -1 }, { TU102_DISP, -1 }, { GV100_DISP, -1 }, { GP102_DISP, -1 }, diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c index bea3daf08570..cdcc851e06f9 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/device/base.c @@ -2684,6 +2684,7 @@ nv172_chipset = { .mmu = tu102_mmu_new, .pci = gp100_pci_new, .timer = gk20a_timer_new, + .disp = ga102_disp_new, .dma = gv100_dma_new, }; @@ -2702,6 +2703,7 @@ nv174_chipset = { .mmu = tu102_mmu_new, .pci = gp100_pci_new, .timer = gk20a_timer_new, + .disp = ga102_disp_new, .dma = gv100_dma_new, }; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/Kbuild b/drivers/gpu/drm/nouveau/nvkm/engine/disp/Kbuild index cf075311cdd2..b03f043efe26 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/Kbuild +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/Kbuild @@ -17,6 +17,7 @@ nvkm-y += nvkm/engine/disp/gp100.o nvkm-y += nvkm/engine/disp/gp102.o nvkm-y += nvkm/engine/disp/gv100.o nvkm-y += nvkm/engine/disp/tu102.o +nvkm-y += nvkm/engine/disp/ga102.o nvkm-y += nvkm/engine/disp/vga.o nvkm-y += nvkm/engine/disp/head.o @@ -42,6 +43,7 @@ nvkm-y += nvkm/engine/disp/sorgm200.o nvkm-y += nvkm/engine/disp/sorgp100.o nvkm-y += nvkm/engine/disp/sorgv100.o nvkm-y += nvkm/engine/disp/sortu102.o +nvkm-y += nvkm/engine/disp/sorga102.o nvkm-y += nvkm/engine/disp/outp.o nvkm-y += nvkm/engine/disp/dp.o @@ -75,6 +77,7 @@ nvkm-y += nvkm/engine/disp/rootgp100.o nvkm-y += nvkm/engine/disp/rootgp102.o nvkm-y += nvkm/engine/disp/rootgv100.o nvkm-y += nvkm/engine/disp/roottu102.o +nvkm-y += nvkm/engine/disp/rootga102.o nvkm-y += nvkm/engine/disp/capsgv100.o diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.c index 3800aeb507d0..55fbfe28c6dc 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/dp.c @@ -33,6 +33,12 @@ #include +/* IED scripts are no longer used by UEFI/RM from Ampere, but have been updated for + * the x86 option ROM. However, the relevant VBIOS table versions weren't modified, + * so we're unable to detect this in a nice way. + */ +#define AMPERE_IED_HACK(disp) ((disp)->engine.subdev.device->card_type >= GA100) + struct lt_state { struct nvkm_dp *dp; u8 stat[6]; @@ -238,6 +244,19 @@ nvkm_dp_train_links(struct nvkm_dp *dp) dp->dpcd[DPCD_RC02] &= ~DPCD_RC02_TPS3_SUPPORTED; lt.pc2 = dp->dpcd[DPCD_RC02] & DPCD_RC02_TPS3_SUPPORTED; + if (AMPERE_IED_HACK(disp) && (lnkcmp = lt.dp->info.script[0])) { + /* Execute BeforeLinkTraining script from DP Info table. */ + while (ior->dp.bw < nvbios_rd08(bios, lnkcmp)) + lnkcmp += 3; + lnkcmp = nvbios_rd16(bios, lnkcmp + 1); + + nvbios_init(&dp->outp.disp->engine.subdev, lnkcmp, + init.outp = &dp->outp.info; + init.or = ior->id; + init.link = ior->asy.link; + ); + } + /* Set desired link configuration on the source. */ if ((lnkcmp = lt.dp->info.lnkcmp)) { if (dp->version < 0x30) { @@ -316,12 +335,14 @@ nvkm_dp_train_init(struct nvkm_dp *dp) ); } - /* Execute BeforeLinkTraining script from DP Info table. */ - nvbios_init(&dp->outp.disp->engine.subdev, dp->info.script[0], - init.outp = &dp->outp.info; - init.or = dp->outp.ior->id; - init.link = dp->outp.ior->asy.link; - ); + if (!AMPERE_IED_HACK(dp->outp.disp)) { + /* Execute BeforeLinkTraining script from DP Info table. */ + nvbios_init(&dp->outp.disp->engine.subdev, dp->info.script[0], + init.outp = &dp->outp.info; + init.or = dp->outp.ior->id; + init.link = dp->outp.ior->asy.link; + ); + } } static const struct dp_rates { diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/ga102.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/ga102.c new file mode 100644 index 000000000000..aa2e5645fe36 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/ga102.c @@ -0,0 +1,46 @@ +/* + * Copyright 2021 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "nv50.h" +#include "head.h" +#include "ior.h" +#include "channv50.h" +#include "rootnv50.h" + +static const struct nv50_disp_func +ga102_disp = { + .init = tu102_disp_init, + .fini = gv100_disp_fini, + .intr = gv100_disp_intr, + .uevent = &gv100_disp_chan_uevent, + .super = gv100_disp_super, + .root = &ga102_disp_root_oclass, + .wndw = { .cnt = gv100_disp_wndw_cnt }, + .head = { .cnt = gv100_head_cnt, .new = gv100_head_new }, + .sor = { .cnt = gv100_sor_cnt, .new = ga102_sor_new }, + .ramht_size = 0x2000, +}; + +int +ga102_disp_new(struct nvkm_device *device, int index, struct nvkm_disp **pdisp) +{ + return nv50_disp_new_(&ga102_disp, device, index, pdisp); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/ior.h b/drivers/gpu/drm/nouveau/nvkm/engine/disp/ior.h index 09f3038eff26..9f0bb7c6b010 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/ior.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/ior.h @@ -150,6 +150,8 @@ void gv100_sor_dp_audio(struct nvkm_ior *, int, bool); void gv100_sor_dp_audio_sym(struct nvkm_ior *, int, u16, u32); void gv100_sor_dp_watermark(struct nvkm_ior *, int, u8); +void tu102_sor_dp_vcpi(struct nvkm_ior *, int, u8, u8, u16, u16); + void g84_hdmi_ctrl(struct nvkm_ior *, int, bool, u8, u8, u8 *, u8 , u8 *, u8); void gt215_hdmi_ctrl(struct nvkm_ior *, int, bool, u8, u8, u8 *, u8 , u8 *, u8); void gf119_hdmi_ctrl(struct nvkm_ior *, int, bool, u8, u8, u8 *, u8 , u8 *, u8); @@ -207,4 +209,6 @@ int gv100_sor_cnt(struct nvkm_disp *, unsigned long *); int gv100_sor_new(struct nvkm_disp *, int); int tu102_sor_new(struct nvkm_disp *, int); + +int ga102_sor_new(struct nvkm_disp *, int); #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/nv50.h b/drivers/gpu/drm/nouveau/nvkm/engine/disp/nv50.h index a677161c7f3a..db31b37752a2 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/nv50.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/nv50.h @@ -86,6 +86,8 @@ void gv100_disp_intr(struct nv50_disp *); void gv100_disp_super(struct work_struct *); int gv100_disp_wndw_cnt(struct nvkm_disp *, unsigned long *); +int tu102_disp_init(struct nv50_disp *); + void nv50_disp_dptmds_war_2(struct nv50_disp *, struct dcb_output *); void nv50_disp_dptmds_war_3(struct nv50_disp *, struct dcb_output *); void nv50_disp_update_sppll1(struct nv50_disp *); diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/rootga102.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/rootga102.c new file mode 100644 index 000000000000..9af07c3cf9fc --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/rootga102.c @@ -0,0 +1,52 @@ +/* + * Copyright 2021 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "rootnv50.h" +#include "channv50.h" + +#include + +static const struct nv50_disp_root_func +ga102_disp_root = { + .user = { + {{-1,-1,GV100_DISP_CAPS }, gv100_disp_caps_new }, + {{0,0,GA102_DISP_CURSOR }, gv100_disp_curs_new }, + {{0,0,GA102_DISP_WINDOW_IMM_CHANNEL_DMA}, gv100_disp_wimm_new }, + {{0,0,GA102_DISP_CORE_CHANNEL_DMA }, gv100_disp_core_new }, + {{0,0,GA102_DISP_WINDOW_CHANNEL_DMA }, gv100_disp_wndw_new }, + {} + }, +}; + +static int +ga102_disp_root_new(struct nvkm_disp *disp, const struct nvkm_oclass *oclass, + void *data, u32 size, struct nvkm_object **pobject) +{ + return nv50_disp_root_new_(&ga102_disp_root, disp, oclass, data, size, pobject); +} + +const struct nvkm_disp_oclass +ga102_disp_root_oclass = { + .base.oclass = GA102_DISP, + .base.minver = -1, + .base.maxver = -1, + .ctor = ga102_disp_root_new, +}; diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/rootnv50.h b/drivers/gpu/drm/nouveau/nvkm/engine/disp/rootnv50.h index 7070f5408d92..27bb170d0293 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/rootnv50.h +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/rootnv50.h @@ -41,4 +41,5 @@ extern const struct nvkm_disp_oclass gp100_disp_root_oclass; extern const struct nvkm_disp_oclass gp102_disp_root_oclass; extern const struct nvkm_disp_oclass gv100_disp_root_oclass; extern const struct nvkm_disp_oclass tu102_disp_root_oclass; +extern const struct nvkm_disp_oclass ga102_disp_root_oclass; #endif diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorga102.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorga102.c new file mode 100644 index 000000000000..033827de9116 --- /dev/null +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sorga102.c @@ -0,0 +1,140 @@ +/* + * Copyright 2021 Red Hat Inc. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ +#include "ior.h" + +#include + +static int +ga102_sor_dp_links(struct nvkm_ior *sor, struct nvkm_i2c_aux *aux) +{ + struct nvkm_device *device = sor->disp->engine.subdev.device; + const u32 soff = nv50_ior_base(sor); + const u32 loff = nv50_sor_link(sor); + u32 dpctrl = 0x00000000; + u32 clksor = 0x00000000; + + switch (sor->dp.bw) { + case 0x06: clksor |= 0x00000000; break; + case 0x0a: clksor |= 0x00040000; break; + case 0x14: clksor |= 0x00080000; break; + case 0x1e: clksor |= 0x000c0000; break; + default: + WARN_ON(1); + return -EINVAL; + } + + dpctrl |= ((1 << sor->dp.nr) - 1) << 16; + if (sor->dp.mst) + dpctrl |= 0x40000000; + if (sor->dp.ef) + dpctrl |= 0x00004000; + + nvkm_mask(device, 0x612300 + soff, 0x007c0000, clksor); + + /*XXX*/ + nvkm_msec(device, 40, NVKM_DELAY); + nvkm_mask(device, 0x612300 + soff, 0x00030000, 0x00010000); + nvkm_mask(device, 0x61c10c + loff, 0x00000003, 0x00000001); + + nvkm_mask(device, 0x61c10c + loff, 0x401f4000, dpctrl); + return 0; +} + +static void +ga102_sor_clock(struct nvkm_ior *sor) +{ + struct nvkm_device *device = sor->disp->engine.subdev.device; + u32 div2 = 0; + if (sor->asy.proto == TMDS) { + if (sor->tmds.high_speed) + div2 = 1; + } + nvkm_wr32(device, 0x00ec08 + (sor->id * 0x10), 0x00000000); + nvkm_wr32(device, 0x00ec04 + (sor->id * 0x10), div2); +} + +static const struct nvkm_ior_func +ga102_sor_hda = { + .route = { + .get = gm200_sor_route_get, + .set = gm200_sor_route_set, + }, + .state = gv100_sor_state, + .power = nv50_sor_power, + .clock = ga102_sor_clock, + .hdmi = { + .ctrl = gv100_hdmi_ctrl, + .scdc = gm200_hdmi_scdc, + }, + .dp = { + .lanes = { 0, 1, 2, 3 }, + .links = ga102_sor_dp_links, + .power = g94_sor_dp_power, + .pattern = gm107_sor_dp_pattern, + .drive = gm200_sor_dp_drive, + .vcpi = tu102_sor_dp_vcpi, + .audio = gv100_sor_dp_audio, + .audio_sym = gv100_sor_dp_audio_sym, + .watermark = gv100_sor_dp_watermark, + }, + .hda = { + .hpd = gf119_hda_hpd, + .eld = gf119_hda_eld, + .device_entry = gv100_hda_device_entry, + }, +}; + +static const struct nvkm_ior_func +ga102_sor = { + .route = { + .get = gm200_sor_route_get, + .set = gm200_sor_route_set, + }, + .state = gv100_sor_state, + .power = nv50_sor_power, + .clock = ga102_sor_clock, + .hdmi = { + .ctrl = gv100_hdmi_ctrl, + .scdc = gm200_hdmi_scdc, + }, + .dp = { + .lanes = { 0, 1, 2, 3 }, + .links = ga102_sor_dp_links, + .power = g94_sor_dp_power, + .pattern = gm107_sor_dp_pattern, + .drive = gm200_sor_dp_drive, + .vcpi = tu102_sor_dp_vcpi, + .audio = gv100_sor_dp_audio, + .audio_sym = gv100_sor_dp_audio_sym, + .watermark = gv100_sor_dp_watermark, + }, +}; + +int +ga102_sor_new(struct nvkm_disp *disp, int id) +{ + struct nvkm_device *device = disp->engine.subdev.device; + u32 hda = nvkm_rd32(device, 0x08a15c); + if (hda & BIT(id)) + return nvkm_ior_new_(&ga102_sor_hda, disp, SOR, id); + return nvkm_ior_new_(&ga102_sor, disp, SOR, id); +} diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/sortu102.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sortu102.c index 59865a934c4b..0cf9e8752d25 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/sortu102.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/sortu102.c @@ -23,7 +23,7 @@ #include -static void +void tu102_sor_dp_vcpi(struct nvkm_ior *sor, int head, u8 slot, u8 slot_nr, u16 pbn, u16 aligned) { diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/tu102.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/tu102.c index 883ae4151ff8..4c85d1d4fbd4 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/tu102.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/tu102.c @@ -28,7 +28,7 @@ #include #include -static int +int tu102_disp_init(struct nv50_disp *disp) { struct nvkm_device *device = disp->base.engine.subdev.device; From dec822771b0174a01e72d7641d08e44461b6a82f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 14 Jan 2021 10:46:57 +0800 Subject: [PATCH 172/241] riscv: stacktrace: Move register keyword to beginning of declaration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Using global sp_in_global directly to fix the following warning, arch/riscv/kernel/stacktrace.c:31:3: warning: ‘register’ is not at beginning of declaration [-Wold-style-declaration] 31 | const register unsigned long current_sp = sp_in_global; | ^~~~~ Signed-off-by: Kefeng Wang Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/stacktrace.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c index 48b870a685b3..df5d2da7c40b 100644 --- a/arch/riscv/kernel/stacktrace.c +++ b/arch/riscv/kernel/stacktrace.c @@ -14,7 +14,7 @@ #include -register unsigned long sp_in_global __asm__("sp"); +register const unsigned long sp_in_global __asm__("sp"); #ifdef CONFIG_FRAME_POINTER @@ -28,9 +28,8 @@ void notrace walk_stackframe(struct task_struct *task, struct pt_regs *regs, sp = user_stack_pointer(regs); pc = instruction_pointer(regs); } else if (task == NULL || task == current) { - const register unsigned long current_sp = sp_in_global; fp = (unsigned long)__builtin_frame_address(0); - sp = current_sp; + sp = sp_in_global; pc = (unsigned long)walk_stackframe; } else { /* task blocked in __switch_to */ From dca5244d2f5b94f1809f0c02a549edf41ccd5493 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 12 Jan 2021 22:48:32 +0000 Subject: [PATCH 173/241] compiler.h: Raise minimum version of GCC to 5.1 for arm64 GCC versions >= 4.9 and < 5.1 have been shown to emit memory references beyond the stack pointer, resulting in memory corruption if an interrupt is taken after the stack pointer has been adjusted but before the reference has been executed. This leads to subtle, infrequent data corruption such as the EXT4 problems reported by Russell King at the link below. Life is too short for buggy compilers, so raise the minimum GCC version required by arm64 to 5.1. Reported-by: Russell King Suggested-by: Arnd Bergmann Signed-off-by: Will Deacon Tested-by: Nathan Chancellor Reviewed-by: Nick Desaulniers Reviewed-by: Nathan Chancellor Acked-by: Linus Torvalds Cc: Cc: Theodore Ts'o Cc: Florian Weimer Cc: Peter Zijlstra Cc: Nick Desaulniers Link: https://lore.kernel.org/r/20210105154726.GD1551@shell.armlinux.org.uk Link: https://lore.kernel.org/r/20210112224832.10980-1-will@kernel.org Signed-off-by: Catalin Marinas --- include/linux/compiler-gcc.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 74c6c0486eed..555ab0fddbef 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -13,6 +13,12 @@ /* https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58145 */ #if GCC_VERSION < 40900 # error Sorry, your version of GCC is too old - please use 4.9 or newer. +#elif defined(CONFIG_ARM64) && GCC_VERSION < 50100 +/* + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63293 + * https://lore.kernel.org/r/20210107111841.GN1551@shell.armlinux.org.uk + */ +# error Sorry, your version of GCC is too old - please use 5.1 or newer. #endif /* From b6d8878d24e39f213df0f3ea7abebd15edc7be21 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 14 Jan 2021 12:48:12 +0000 Subject: [PATCH 174/241] arm64: syscall: include prototype for EL0 SVC functions The kbuild test robot reports that when building with W=1, GCC will warn for a couple of missing prototypes in syscall.c: | arch/arm64/kernel/syscall.c:157:6: warning: no previous prototype for 'do_el0_svc' [-Wmissing-prototypes] | 157 | void do_el0_svc(struct pt_regs *regs) | | ^~~~~~~~~~ | arch/arm64/kernel/syscall.c:164:6: warning: no previous prototype for 'do_el0_svc_compat' [-Wmissing-prototypes] | 164 | void do_el0_svc_compat(struct pt_regs *regs) | | ^~~~~~~~~~~~~~~~~ While this isn't a functional problem, as a general policy we should include the prototype for functions wherever possible to catch any accidental divergence between the prototype and implementation. Here we can easily include , so let's do so. While there are a number of warnings elsewhere and some warnings enabled under W=1 are of questionable benefit, this change helps to make the code more robust as it evolved and reduces the noise somewhat, so it seems worthwhile. Signed-off-by: Mark Rutland Reported-by: kernel test robot Cc: Will Deacon Link: https://lore.kernel.org/r/202101141046.n8iPO3mw-lkp@intel.com Link: https://lore.kernel.org/r/20210114124812.17754-1-mark.rutland@arm.com Signed-off-by: Catalin Marinas --- arch/arm64/kernel/syscall.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index 0bfac95fe464..c2877c332f2d 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include From 3a57a643a851dbb1c4a1819394ca009e3bfa4813 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Fri, 8 Jan 2021 18:31:44 +0000 Subject: [PATCH 175/241] arm64: selftests: Fix spelling of 'Mismatch' The SVE and FPSIMD stress tests have a spelling mistake in the output, fix it. Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20210108183144.673-1-broonie@kernel.org Signed-off-by: Catalin Marinas --- tools/testing/selftests/arm64/fp/fpsimd-test.S | 2 +- tools/testing/selftests/arm64/fp/sve-test.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/arm64/fp/fpsimd-test.S b/tools/testing/selftests/arm64/fp/fpsimd-test.S index 1c5556bdd11d..0dbd594c2747 100644 --- a/tools/testing/selftests/arm64/fp/fpsimd-test.S +++ b/tools/testing/selftests/arm64/fp/fpsimd-test.S @@ -457,7 +457,7 @@ function barf mov x11, x1 // actual data mov x12, x2 // data size - puts "Mistatch: PID=" + puts "Mismatch: PID=" mov x0, x20 bl putdec puts ", iteration=" diff --git a/tools/testing/selftests/arm64/fp/sve-test.S b/tools/testing/selftests/arm64/fp/sve-test.S index f95074c9b48b..9210691aa998 100644 --- a/tools/testing/selftests/arm64/fp/sve-test.S +++ b/tools/testing/selftests/arm64/fp/sve-test.S @@ -625,7 +625,7 @@ function barf mov x11, x1 // actual data mov x12, x2 // data size - puts "Mistatch: PID=" + puts "Mismatch: PID=" mov x0, x20 bl putdec puts ", iteration=" From 4d163ad79b155c71bf30366dc38f8d2502f78844 Mon Sep 17 00:00:00 2001 From: Michael Hennerich Date: Thu, 14 Jan 2021 17:42:17 +0200 Subject: [PATCH 176/241] spi: cadence: cache reference clock rate during probe The issue is that using SPI from a callback under the CCF lock will deadlock, since this code uses clk_get_rate(). Fixes: c474b38665463 ("spi: Add driver for Cadence SPI controller") Signed-off-by: Michael Hennerich Signed-off-by: Alexandru Ardelean Link: https://lore.kernel.org/r/20210114154217.51996-1-alexandru.ardelean@analog.com Signed-off-by: Mark Brown --- drivers/spi/spi-cadence.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/spi/spi-cadence.c b/drivers/spi/spi-cadence.c index 70467b9d61ba..a3afd1b9ac56 100644 --- a/drivers/spi/spi-cadence.c +++ b/drivers/spi/spi-cadence.c @@ -115,6 +115,7 @@ struct cdns_spi { void __iomem *regs; struct clk *ref_clk; struct clk *pclk; + unsigned int clk_rate; u32 speed_hz; const u8 *txbuf; u8 *rxbuf; @@ -250,7 +251,7 @@ static void cdns_spi_config_clock_freq(struct spi_device *spi, u32 ctrl_reg, baud_rate_val; unsigned long frequency; - frequency = clk_get_rate(xspi->ref_clk); + frequency = xspi->clk_rate; ctrl_reg = cdns_spi_read(xspi, CDNS_SPI_CR); @@ -558,8 +559,9 @@ static int cdns_spi_probe(struct platform_device *pdev) master->auto_runtime_pm = true; master->mode_bits = SPI_CPOL | SPI_CPHA | SPI_CS_HIGH; + xspi->clk_rate = clk_get_rate(xspi->ref_clk); /* Set to default valid value */ - master->max_speed_hz = clk_get_rate(xspi->ref_clk) / 4; + master->max_speed_hz = xspi->clk_rate / 4; xspi->speed_hz = master->max_speed_hz; master->bits_per_word_mask = SPI_BPW_MASK(8); From f010505b78a4fa8d5b6480752566e7313fb5ca6e Mon Sep 17 00:00:00 2001 From: Marcelo Diop-Gonzalez Date: Fri, 15 Jan 2021 11:54:40 -0500 Subject: [PATCH 177/241] io_uring: flush timeouts that should already have expired Right now io_flush_timeouts() checks if the current number of events is equal to ->timeout.target_seq, but this will miss some timeouts if there have been more than 1 event added since the last time they were flushed (possible in io_submit_flush_completions(), for example). Fix it by recording the last sequence at which timeouts were flushed so that the number of events seen can be compared to the number of events needed without overflow. Signed-off-by: Marcelo Diop-Gonzalez Reviewed-by: Pavel Begunkov Signed-off-by: Jens Axboe --- fs/io_uring.c | 34 ++++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/fs/io_uring.c b/fs/io_uring.c index 372be9caf340..06cc79d39586 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -354,6 +354,7 @@ struct io_ring_ctx { unsigned cq_entries; unsigned cq_mask; atomic_t cq_timeouts; + unsigned cq_last_tm_flush; unsigned long cq_check_overflow; struct wait_queue_head cq_wait; struct fasync_struct *cq_fasync; @@ -1639,19 +1640,38 @@ static void __io_queue_deferred(struct io_ring_ctx *ctx) static void io_flush_timeouts(struct io_ring_ctx *ctx) { - while (!list_empty(&ctx->timeout_list)) { + u32 seq; + + if (list_empty(&ctx->timeout_list)) + return; + + seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); + + do { + u32 events_needed, events_got; struct io_kiocb *req = list_first_entry(&ctx->timeout_list, struct io_kiocb, timeout.list); if (io_is_timeout_noseq(req)) break; - if (req->timeout.target_seq != ctx->cached_cq_tail - - atomic_read(&ctx->cq_timeouts)) + + /* + * Since seq can easily wrap around over time, subtract + * the last seq at which timeouts were flushed before comparing. + * Assuming not more than 2^31-1 events have happened since, + * these subtractions won't have wrapped, so we can check if + * target is in [last_seq, current_seq] by comparing the two. + */ + events_needed = req->timeout.target_seq - ctx->cq_last_tm_flush; + events_got = seq - ctx->cq_last_tm_flush; + if (events_got < events_needed) break; list_del_init(&req->timeout.list); io_kill_timeout(req); - } + } while (!list_empty(&ctx->timeout_list)); + + ctx->cq_last_tm_flush = seq; } static void io_commit_cqring(struct io_ring_ctx *ctx) @@ -5837,6 +5857,12 @@ static int io_timeout(struct io_kiocb *req) tail = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts); req->timeout.target_seq = tail + off; + /* Update the last seq here in case io_flush_timeouts() hasn't. + * This is safe because ->completion_lock is held, and submissions + * and completions are never mixed in the same ->completion_lock section. + */ + ctx->cq_last_tm_flush = tail; + /* * Insertion sort, ensuring the first entry in the list is always * the one we need first. From 301f0203e04293c13372c032198665bd75adf81b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 29 Dec 2020 15:41:19 -0300 Subject: [PATCH 178/241] perf bpf examples: Fix bpf.h header include directive in 5sec.c example It was looking at bpf/bpf.h, which caused this problem: # perf trace -e tools/perf/examples/bpf/5sec.c /home/acme/git/perf/tools/perf/examples/bpf/5sec.c:42:10: fatal error: 'bpf/bpf.h' file not found #include ^~~~~~~~~~~ 1 error generated. ERROR: unable to compile tools/perf/examples/bpf/5sec.c Hint: Check error message shown above. Hint: You can also pre-compile it into .o using: clang -target bpf -O2 -c tools/perf/examples/bpf/5sec.c with proper -I and -D options. event syntax error: 'tools/perf/examples/bpf/5sec.c' \___ Failed to load tools/perf/examples/bpf/5sec.c from source: Error when compiling BPF scriptlet # Change that to plain bpf.h, to make it work again: # perf trace -e tools/perf/examples/bpf/5sec.c sleep 5s 0.000 perf_bpf_probe:hrtimer_nanosleep(__probe_ip: -1776891872, rqtp: 5000000000) # perf trace -e tools/perf/examples/bpf/5sec.c/max-stack=16/ sleep 5s 0.000 perf_bpf_probe:hrtimer_nanosleep(__probe_ip: -1776891872, rqtp: 5000000000) hrtimer_nanosleep ([kernel.kallsyms]) common_nsleep ([kernel.kallsyms]) __x64_sys_clock_nanosleep ([kernel.kallsyms]) do_syscall_64 ([kernel.kallsyms]) entry_SYSCALL_64_after_hwframe ([kernel.kallsyms]) __clock_nanosleep_2 (/usr/lib64/libc-2.32.so) # perf trace -e tools/perf/examples/bpf/5sec.c sleep 4s # Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/examples/bpf/5sec.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/examples/bpf/5sec.c b/tools/perf/examples/bpf/5sec.c index 65c4ff6892d9..e6b6181c6dc6 100644 --- a/tools/perf/examples/bpf/5sec.c +++ b/tools/perf/examples/bpf/5sec.c @@ -39,7 +39,7 @@ Copyright (C) 2018 Red Hat, Inc., Arnaldo Carvalho de Melo */ -#include +#include #define NSEC_PER_SEC 1000000000L From 38c53947a7dcb6d295769830c9085b0409921ec9 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 14 Jan 2021 12:26:08 -0300 Subject: [PATCH 179/241] tools headers UAPI: Sync kvm.h headers with the kernel sources To pick the changes in: 647daca25d24fb6e ("KVM: SVM: Add support for booting APs in an SEV-ES guest") That don't cause any tooling change, just silences this perf build warning: Warning: Kernel ABI header at 'tools/include/uapi/linux/kvm.h' differs from latest version at 'include/uapi/linux/kvm.h' diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Paolo Bonzini Cc: Tom Lendacky Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/kvm.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 886802b8ffba..374c67875cdb 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -251,6 +251,7 @@ struct kvm_hyperv_exit { #define KVM_EXIT_X86_RDMSR 29 #define KVM_EXIT_X86_WRMSR 30 #define KVM_EXIT_DIRTY_RING_FULL 31 +#define KVM_EXIT_AP_RESET_HOLD 32 /* For KVM_EXIT_INTERNAL_ERROR */ /* Emulate instruction failed. */ @@ -573,6 +574,7 @@ struct kvm_vapic_addr { #define KVM_MP_STATE_CHECK_STOP 6 #define KVM_MP_STATE_OPERATING 7 #define KVM_MP_STATE_LOAD 8 +#define KVM_MP_STATE_AP_RESET_HOLD 9 struct kvm_mp_state { __u32 mp_state; From addbdff24293ef772a1b8e5d127b570e70f08cdc Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 14 Jan 2021 12:35:27 -0300 Subject: [PATCH 180/241] tools headers: Syncronize linux/build_bug.h with the kernel sources To pick up the changes in: 3a176b94609a18f5 ("Revert "kbuild: avoid static_assert for genksyms"") And silence this perf build warning: Warning: Kernel ABI header at 'tools/include/linux/build_bug.h' differs from latest version at 'include/linux/build_bug.h' diff -u tools/include/linux/build_bug.h include/linux/build_bug.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Masahiro Yamada Cc: Namhyung Kim Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/linux/build_bug.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tools/include/linux/build_bug.h b/tools/include/linux/build_bug.h index ce365d212768..cc7070c7439b 100644 --- a/tools/include/linux/build_bug.h +++ b/tools/include/linux/build_bug.h @@ -79,9 +79,4 @@ #define __static_assert(expr, msg, ...) _Static_assert(expr, msg) #endif // static_assert -#ifdef __GENKSYMS__ -/* genksyms gets confused by _Static_assert */ -#define _Static_assert(expr, ...) -#endif - #endif /* _LINUX_BUILD_BUG_H */ From a042a82ddbb3434f523c0671f5301d1fe796b4eb Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 14 Jan 2021 14:06:09 +0900 Subject: [PATCH 181/241] perf test: Fix shadow stat test for non-bash shells It was using some bash-specific features and failed to parse when running with a different shell like below: root@kbl-ppc:~/kbl-ws/perf-dev/lck-9077/acme.tmp/tools/perf# ./perf test 83 -vv 83: perf stat metrics (shadow stat) test : --- start --- test child forked, pid 3922 ./tests/shell/stat+shadow_stat.sh: 19: ./tests/shell/stat+shadow_stat.sh: [[: not found ./tests/shell/stat+shadow_stat.sh: 24: ./tests/shell/stat+shadow_stat.sh: [[: not found ./tests/shell/stat+shadow_stat.sh: 30: ./tests/shell/stat+shadow_stat.sh: [[: not found (standard_in) 2: syntax error ./tests/shell/stat+shadow_stat.sh: 36: ./tests/shell/stat+shadow_stat.sh: [[: not found ./tests/shell/stat+shadow_stat.sh: 19: ./tests/shell/stat+shadow_stat.sh: [[: not found ./tests/shell/stat+shadow_stat.sh: 24: ./tests/shell/stat+shadow_stat.sh: [[: not found ./tests/shell/stat+shadow_stat.sh: 30: ./tests/shell/stat+shadow_stat.sh: [[: not found (standard_in) 2: syntax error ./tests/shell/stat+shadow_stat.sh: 36: ./tests/shell/stat+shadow_stat.sh: [[: not found ./tests/shell/stat+shadow_stat.sh: 45: ./tests/shell/stat+shadow_stat.sh: declare: not found test child finished with -1 ---- end ---- perf stat metrics (shadow stat) test: FAILED! Reported-by: Jin Yao Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: David Laight Cc: Ian Rogers Cc: Mark Rutland Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20210114050609.1258820-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/shell/stat+shadow_stat.sh | 30 ++++++++++------------ 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/tools/perf/tests/shell/stat+shadow_stat.sh b/tools/perf/tests/shell/stat+shadow_stat.sh index 249dfe48cf6a..ebebd3596cf9 100755 --- a/tools/perf/tests/shell/stat+shadow_stat.sh +++ b/tools/perf/tests/shell/stat+shadow_stat.sh @@ -9,31 +9,29 @@ perf stat -a true > /dev/null 2>&1 || exit 2 test_global_aggr() { - local cyc - perf stat -a --no-big-num -e cycles,instructions sleep 1 2>&1 | \ grep -e cycles -e instructions | \ while read num evt hash ipc rest do # skip not counted events - if [[ $num == "&1 | \ grep ^CPU | \ while read cpu num evt hash ipc rest do # skip not counted events - if [[ $num == " Date: Fri, 15 Jan 2021 06:10:04 -0800 Subject: [PATCH 182/241] octeontx2-af: Fix missing check bugs in rvu_cgx.c In rvu_mbox_handler_cgx_mac_addr_get() and rvu_mbox_handler_cgx_mac_addr_set(), the msg is expected only from PFs that are mapped to CGX LMACs. It should be checked before mapping, so we add the is_cgx_config_permitted() in the functions. Fixes: 96be2e0da85e ("octeontx2-af: Support for MAC address filters in CGX") Signed-off-by: Yingjie Wang Reviewed-by: Geetha sowjanya Link: https://lore.kernel.org/r/1610719804-35230-1-git-send-email-wangyingjie55@126.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c index d298b9357177..6c6b411e78fd 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cgx.c @@ -469,6 +469,9 @@ int rvu_mbox_handler_cgx_mac_addr_set(struct rvu *rvu, int pf = rvu_get_pf(req->hdr.pcifunc); u8 cgx_id, lmac_id; + if (!is_cgx_config_permitted(rvu, req->hdr.pcifunc)) + return -EPERM; + rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id); cgx_lmac_addr_set(cgx_id, lmac_id, req->mac_addr); @@ -485,6 +488,9 @@ int rvu_mbox_handler_cgx_mac_addr_get(struct rvu *rvu, int rc = 0, i; u64 cfg; + if (!is_cgx_config_permitted(rvu, req->hdr.pcifunc)) + return -EPERM; + rvu_get_cgx_lmac_id(rvu->pf2cgxlmac_map[pf], &cgx_id, &lmac_id); rsp->hdr.rc = rc; From 23dd561ad9eae02b4d51bb502fe4e1a0666e9567 Mon Sep 17 00:00:00 2001 From: Yi Li Date: Wed, 30 Dec 2020 11:38:27 +0800 Subject: [PATCH 183/241] ext4: use IS_ERR instead of IS_ERR_OR_NULL and set inode null when IS_ERR 1: ext4_iget/ext4_find_extent never returns NULL, use IS_ERR instead of IS_ERR_OR_NULL to fix this. 2: ext4_fc_replay_inode should set the inode to NULL when IS_ERR. and go to call iput properly. Fixes: 8016e29f4362 ("ext4: fast commit recovery path") Signed-off-by: Yi Li Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20201230033827.3996064-1-yili@winhong.com Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/fast_commit.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 4fcc21c25e79..6b5489273c85 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1318,14 +1318,14 @@ static int ext4_fc_replay_unlink(struct super_block *sb, struct ext4_fc_tl *tl) entry.len = darg.dname_len; inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); - if (IS_ERR_OR_NULL(inode)) { + if (IS_ERR(inode)) { jbd_debug(1, "Inode %d not found", darg.ino); return 0; } old_parent = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); - if (IS_ERR_OR_NULL(old_parent)) { + if (IS_ERR(old_parent)) { jbd_debug(1, "Dir with inode %d not found", darg.parent_ino); iput(inode); return 0; @@ -1410,7 +1410,7 @@ static int ext4_fc_replay_link(struct super_block *sb, struct ext4_fc_tl *tl) darg.parent_ino, darg.dname_len); inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); - if (IS_ERR_OR_NULL(inode)) { + if (IS_ERR(inode)) { jbd_debug(1, "Inode not found."); return 0; } @@ -1466,10 +1466,11 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl) trace_ext4_fc_replay(sb, tag, ino, 0, 0); inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); - if (!IS_ERR_OR_NULL(inode)) { + if (!IS_ERR(inode)) { ext4_ext_clear_bb(inode); iput(inode); } + inode = NULL; ext4_fc_record_modified_inode(sb, ino); @@ -1512,7 +1513,7 @@ static int ext4_fc_replay_inode(struct super_block *sb, struct ext4_fc_tl *tl) /* Given that we just wrote the inode on disk, this SHOULD succeed. */ inode = ext4_iget(sb, ino, EXT4_IGET_NORMAL); - if (IS_ERR_OR_NULL(inode)) { + if (IS_ERR(inode)) { jbd_debug(1, "Inode not found."); return -EFSCORRUPTED; } @@ -1564,7 +1565,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl) goto out; inode = ext4_iget(sb, darg.ino, EXT4_IGET_NORMAL); - if (IS_ERR_OR_NULL(inode)) { + if (IS_ERR(inode)) { jbd_debug(1, "inode %d not found.", darg.ino); inode = NULL; ret = -EINVAL; @@ -1577,7 +1578,7 @@ static int ext4_fc_replay_create(struct super_block *sb, struct ext4_fc_tl *tl) * dot and dot dot dirents are setup properly. */ dir = ext4_iget(sb, darg.parent_ino, EXT4_IGET_NORMAL); - if (IS_ERR_OR_NULL(dir)) { + if (IS_ERR(dir)) { jbd_debug(1, "Dir %d not found.", darg.ino); goto out; } @@ -1653,7 +1654,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, inode = ext4_iget(sb, le32_to_cpu(fc_add_ex->fc_ino), EXT4_IGET_NORMAL); - if (IS_ERR_OR_NULL(inode)) { + if (IS_ERR(inode)) { jbd_debug(1, "Inode not found."); return 0; } @@ -1777,7 +1778,7 @@ ext4_fc_replay_del_range(struct super_block *sb, struct ext4_fc_tl *tl) le32_to_cpu(lrange->fc_ino), cur, remaining); inode = ext4_iget(sb, le32_to_cpu(lrange->fc_ino), EXT4_IGET_NORMAL); - if (IS_ERR_OR_NULL(inode)) { + if (IS_ERR(inode)) { jbd_debug(1, "Inode %d not found", le32_to_cpu(lrange->fc_ino)); return 0; } @@ -1832,7 +1833,7 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) for (i = 0; i < state->fc_modified_inodes_used; i++) { inode = ext4_iget(sb, state->fc_modified_inodes[i], EXT4_IGET_NORMAL); - if (IS_ERR_OR_NULL(inode)) { + if (IS_ERR(inode)) { jbd_debug(1, "Inode %d not found.", state->fc_modified_inodes[i]); continue; @@ -1849,7 +1850,7 @@ static void ext4_fc_set_bitmaps_and_counters(struct super_block *sb) if (ret > 0) { path = ext4_find_extent(inode, map.m_lblk, NULL, 0); - if (!IS_ERR_OR_NULL(path)) { + if (!IS_ERR(path)) { for (j = 0; j < path->p_depth; j++) ext4_mb_mark_bb(inode->i_sb, path[j].p_block, 1, 1); From 31e203e09f036f48e7c567c2d32df0196bbd303f Mon Sep 17 00:00:00 2001 From: Daejun Park Date: Wed, 30 Dec 2020 18:48:51 +0900 Subject: [PATCH 184/241] ext4: fix wrong list_splice in ext4_fc_cleanup After full/fast commit, entries in staging queue are promoted to main queue. In ext4_fs_cleanup function, it splice to staging queue to staging queue. Fixes: aa75f4d3daaeb ("ext4: main fast-commit commit path") Signed-off-by: Daejun Park Reviewed-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20201230094851epcms2p6eeead8cc984379b37b2efd21af90fd1a@epcms2p6 Signed-off-by: Theodore Ts'o Cc: stable@kernel.org --- fs/ext4/fast_commit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 6b5489273c85..ec5316fbcd1d 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -1268,7 +1268,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full) list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], &sbi->s_fc_dentry_q[FC_Q_MAIN]); list_splice_init(&sbi->s_fc_q[FC_Q_STAGING], - &sbi->s_fc_q[FC_Q_STAGING]); + &sbi->s_fc_q[FC_Q_MAIN]); ext4_clear_mount_flag(sb, EXT4_MF_FC_COMMITTING); ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); From 6b4b8e6b4ad8553660421d6360678b3811d5deb9 Mon Sep 17 00:00:00 2001 From: yangerkun Date: Tue, 5 Jan 2021 14:28:57 +0800 Subject: [PATCH 185/241] ext4: fix bug for rename with RENAME_WHITEOUT We got a "deleted inode referenced" warning cross our fsstress test. The bug can be reproduced easily with following steps: cd /dev/shm mkdir test/ fallocate -l 128M img mkfs.ext4 -b 1024 img mount img test/ dd if=/dev/zero of=test/foo bs=1M count=128 mkdir test/dir/ && cd test/dir/ for ((i=0;i<1000;i++)); do touch file$i; done # consume all block cd ~ && renameat2(AT_FDCWD, /dev/shm/test/dir/file1, AT_FDCWD, /dev/shm/test/dir/dst_file, RENAME_WHITEOUT) # ext4_add_entry in ext4_rename will return ENOSPC!! cd /dev/shm/ && umount test/ && mount img test/ && ls -li test/dir/file1 We will get the output: "ls: cannot access 'test/dir/file1': Structure needs cleaning" and the dmesg show: "EXT4-fs error (device loop0): ext4_lookup:1626: inode #2049: comm ls: deleted inode referenced: 139" ext4_rename will create a special inode for whiteout and use this 'ino' to replace the source file's dir entry 'ino'. Once error happens latter(the error above was the ENOSPC return from ext4_add_entry in ext4_rename since all space has been consumed), the cleanup do drop the nlink for whiteout, but forget to restore 'ino' with source file. This will trigger the bug describle as above. Signed-off-by: yangerkun Reviewed-by: Jan Kara Cc: stable@vger.kernel.org Fixes: cd808deced43 ("ext4: support RENAME_WHITEOUT") Link: https://lore.kernel.org/r/20210105062857.3566-1-yangerkun@huawei.com Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a3b28ef2455a..fa625a247e9a 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3601,9 +3601,6 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, return retval2; } } - brelse(ent->bh); - ent->bh = NULL; - return retval; } @@ -3802,6 +3799,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, } } + old_file_type = old.de->file_type; if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) ext4_handle_sync(handle); @@ -3829,7 +3827,6 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, force_reread = (new.dir->i_ino == old.dir->i_ino && ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); - old_file_type = old.de->file_type; if (whiteout) { /* * Do this before adding a new entry, so the old entry is sure @@ -3927,15 +3924,19 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, retval = 0; end_rename: + if (whiteout) { + if (retval) { + ext4_setent(handle, &old, + old.inode->i_ino, old_file_type); + drop_nlink(whiteout); + } + unlock_new_inode(whiteout); + iput(whiteout); + + } brelse(old.dir_bh); brelse(old.bh); brelse(new.bh); - if (whiteout) { - if (retval) - drop_nlink(whiteout); - unlock_new_inode(whiteout); - iput(whiteout); - } if (handle) ext4_journal_stop(handle); return retval; From e9f53353e166a67dfe4f8295100f8ac39d6cf10b Mon Sep 17 00:00:00 2001 From: Daejun Park Date: Wed, 6 Jan 2021 10:32:42 +0900 Subject: [PATCH 186/241] ext4: remove expensive flush on fast commit In the fast commit, it adds REQ_FUA and REQ_PREFLUSH on each fast commit block when barrier is enabled. However, in recovery phase, ext4 compares CRC value in the tail. So it is sufficient to add REQ_FUA and REQ_PREFLUSH on the block that has tail. Signed-off-by: Daejun Park Reviewed-by: Harshad Shirwadkar Link: https://lore.kernel.org/r/20210106013242epcms2p5b6b4ed8ca86f29456fdf56aa580e74b4@epcms2p5 Signed-off-by: Theodore Ts'o --- fs/ext4/fast_commit.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index ec5316fbcd1d..0a14a7c87bf8 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -604,13 +604,13 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t star trace_ext4_fc_track_range(inode, start, end, ret); } -static void ext4_fc_submit_bh(struct super_block *sb) +static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) { int write_flags = REQ_SYNC; struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; - /* TODO: REQ_FUA | REQ_PREFLUSH is unnecessarily expensive. */ - if (test_opt(sb, BARRIER)) + /* Add REQ_FUA | REQ_PREFLUSH only its tail */ + if (test_opt(sb, BARRIER) && is_tail) write_flags |= REQ_FUA | REQ_PREFLUSH; lock_buffer(bh); set_buffer_dirty(bh); @@ -684,7 +684,7 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) *crc = ext4_chksum(sbi, *crc, tl, sizeof(*tl)); if (pad_len > 0) ext4_fc_memzero(sb, tl + 1, pad_len, crc); - ext4_fc_submit_bh(sb); + ext4_fc_submit_bh(sb, false); ret = jbd2_fc_get_buf(EXT4_SB(sb)->s_journal, &bh); if (ret) @@ -741,7 +741,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) tail.fc_crc = cpu_to_le32(crc); ext4_fc_memcpy(sb, dst, &tail.fc_crc, sizeof(tail.fc_crc), NULL); - ext4_fc_submit_bh(sb); + ext4_fc_submit_bh(sb, true); return 0; } From be82fddca81eefd1edbd9b290dfcb2177e24785b Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 14 Jan 2021 13:23:04 -0800 Subject: [PATCH 187/241] libperf tests: Avoid uninitialized variable warning The variable 'bf' is read (for a write call) without being initialized triggering a memory sanitizer warning. Use 'bf' in the read and switch the write to reading from a string. Signed-off-by: Ian Rogers Acked-by: Jiri Olsa Acked-by: Namhyung Kim Cc: Alexander Shishkin Cc: Mark Rutland Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20210114212304.4018119-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/tests/test-evlist.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/lib/perf/tests/test-evlist.c b/tools/lib/perf/tests/test-evlist.c index 6d8ebe0c2504..1b225fe34a72 100644 --- a/tools/lib/perf/tests/test-evlist.c +++ b/tools/lib/perf/tests/test-evlist.c @@ -208,7 +208,6 @@ static int test_mmap_thread(void) char path[PATH_MAX]; int id, err, pid, go_pipe[2]; union perf_event *event; - char bf; int count = 0; snprintf(path, PATH_MAX, "%s/kernel/debug/tracing/events/syscalls/sys_enter_prctl/id", @@ -229,6 +228,7 @@ static int test_mmap_thread(void) pid = fork(); if (!pid) { int i; + char bf; read(go_pipe[0], &bf, 1); @@ -266,7 +266,7 @@ static int test_mmap_thread(void) perf_evlist__enable(evlist); /* kick the child and wait for it to finish */ - write(go_pipe[1], &bf, 1); + write(go_pipe[1], "A", 1); waitpid(pid, NULL, 0); /* From bba2ea17ef553aea0df80cb64399fe2f70f225dd Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 14 Jan 2021 10:02:49 -0800 Subject: [PATCH 188/241] libperf tests: If a test fails return non-zero If a test fails return -1 rather than 0. This is consistent with the return value in test-cpumap.c Signed-off-by: Ian Rogers Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20210114180250.3853825-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/tests/test-cpumap.c | 2 +- tools/lib/perf/tests/test-evlist.c | 2 +- tools/lib/perf/tests/test-evsel.c | 2 +- tools/lib/perf/tests/test-threadmap.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/lib/perf/tests/test-cpumap.c b/tools/lib/perf/tests/test-cpumap.c index c8d45091e7c2..c70e9e03af3e 100644 --- a/tools/lib/perf/tests/test-cpumap.c +++ b/tools/lib/perf/tests/test-cpumap.c @@ -27,5 +27,5 @@ int main(int argc, char **argv) perf_cpu_map__put(cpus); __T_END; - return 0; + return tests_failed == 0 ? 0 : -1; } diff --git a/tools/lib/perf/tests/test-evlist.c b/tools/lib/perf/tests/test-evlist.c index 1b225fe34a72..220a95be47c3 100644 --- a/tools/lib/perf/tests/test-evlist.c +++ b/tools/lib/perf/tests/test-evlist.c @@ -409,5 +409,5 @@ int main(int argc, char **argv) test_mmap_cpus(); __T_END; - return 0; + return tests_failed == 0 ? 0 : -1; } diff --git a/tools/lib/perf/tests/test-evsel.c b/tools/lib/perf/tests/test-evsel.c index 135722ac965b..0ad82d7a2a51 100644 --- a/tools/lib/perf/tests/test-evsel.c +++ b/tools/lib/perf/tests/test-evsel.c @@ -131,5 +131,5 @@ int main(int argc, char **argv) test_stat_thread_enable(); __T_END; - return 0; + return tests_failed == 0 ? 0 : -1; } diff --git a/tools/lib/perf/tests/test-threadmap.c b/tools/lib/perf/tests/test-threadmap.c index 7dc4d6fbedde..384471441b48 100644 --- a/tools/lib/perf/tests/test-threadmap.c +++ b/tools/lib/perf/tests/test-threadmap.c @@ -27,5 +27,5 @@ int main(int argc, char **argv) perf_thread_map__put(threads); __T_END; - return 0; + return tests_failed == 0 ? 0 : -1; } From 66dd86b2a2bee129c70f7ff054d3a6a2e5f8eb20 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 14 Jan 2021 10:02:50 -0800 Subject: [PATCH 189/241] libperf tests: Fail when failing to get a tracepoint id Permissions are necessary to get a tracepoint id. Fail the test when the read fails. Signed-off-by: Ian Rogers Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20210114180250.3853825-2-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/perf/tests/test-evlist.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/lib/perf/tests/test-evlist.c b/tools/lib/perf/tests/test-evlist.c index 220a95be47c3..e2ac0b7f432e 100644 --- a/tools/lib/perf/tests/test-evlist.c +++ b/tools/lib/perf/tests/test-evlist.c @@ -214,6 +214,7 @@ static int test_mmap_thread(void) sysfs__mountpoint()); if (filename__read_int(path, &id)) { + tests_failed++; fprintf(stderr, "error: failed to get tracepoint id: %s\n", path); return -1; } From 3ff1e7180abc7f6db413933c110df69157216715 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 15 Jan 2021 16:11:38 +0900 Subject: [PATCH 190/241] perf stat: Introduce struct runtime_stat_data To pass more info to the saved_value in the runtime_stat, add a new struct runtime_stat_data. Currently it only has 'ctx' field but later patch will add more. Note that we intentionally pass 0 as ctx to clock-related events for compatibility. It was already there in a few places. So move the code into the saved_value_lookup() explicitly and add a comment. Suggested-by: Andi Kleen Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ian Rogers Cc: Jin Yao Cc: Mark Rutland Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20210115071139.257042-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-shadow.c | 346 +++++++++++++++++----------------- 1 file changed, 173 insertions(+), 173 deletions(-) diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index 901265127e36..a1565b6e38f2 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -114,6 +114,10 @@ static struct saved_value *saved_value_lookup(struct evsel *evsel, rblist = &st->value_list; + /* don't use context info for clock events */ + if (type == STAT_NSECS) + dm.ctx = 0; + nd = rblist__find(rblist, &dm); if (nd) return container_of(nd, struct saved_value, rb_node); @@ -191,12 +195,17 @@ void perf_stat__reset_shadow_per_stat(struct runtime_stat *st) reset_stat(st); } +struct runtime_stat_data { + int ctx; +}; + static void update_runtime_stat(struct runtime_stat *st, enum stat_type type, - int ctx, int cpu, u64 count) + int cpu, u64 count, + struct runtime_stat_data *rsd) { - struct saved_value *v = saved_value_lookup(NULL, cpu, true, - type, ctx, st); + struct saved_value *v = saved_value_lookup(NULL, cpu, true, type, + rsd->ctx, st); if (v) update_stats(&v->stats, count); @@ -210,73 +219,75 @@ static void update_runtime_stat(struct runtime_stat *st, void perf_stat__update_shadow_stats(struct evsel *counter, u64 count, int cpu, struct runtime_stat *st) { - int ctx = evsel_context(counter); u64 count_ns = count; struct saved_value *v; + struct runtime_stat_data rsd = { + .ctx = evsel_context(counter), + }; count *= counter->scale; if (evsel__is_clock(counter)) - update_runtime_stat(st, STAT_NSECS, 0, cpu, count_ns); + update_runtime_stat(st, STAT_NSECS, cpu, count_ns, &rsd); else if (evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) - update_runtime_stat(st, STAT_CYCLES, ctx, cpu, count); + update_runtime_stat(st, STAT_CYCLES, cpu, count, &rsd); else if (perf_stat_evsel__is(counter, CYCLES_IN_TX)) - update_runtime_stat(st, STAT_CYCLES_IN_TX, ctx, cpu, count); + update_runtime_stat(st, STAT_CYCLES_IN_TX, cpu, count, &rsd); else if (perf_stat_evsel__is(counter, TRANSACTION_START)) - update_runtime_stat(st, STAT_TRANSACTION, ctx, cpu, count); + update_runtime_stat(st, STAT_TRANSACTION, cpu, count, &rsd); else if (perf_stat_evsel__is(counter, ELISION_START)) - update_runtime_stat(st, STAT_ELISION, ctx, cpu, count); + update_runtime_stat(st, STAT_ELISION, cpu, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_TOTAL_SLOTS)) update_runtime_stat(st, STAT_TOPDOWN_TOTAL_SLOTS, - ctx, cpu, count); + cpu, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_ISSUED)) update_runtime_stat(st, STAT_TOPDOWN_SLOTS_ISSUED, - ctx, cpu, count); + cpu, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_SLOTS_RETIRED)) update_runtime_stat(st, STAT_TOPDOWN_SLOTS_RETIRED, - ctx, cpu, count); + cpu, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_FETCH_BUBBLES)) update_runtime_stat(st, STAT_TOPDOWN_FETCH_BUBBLES, - ctx, cpu, count); + cpu, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_RECOVERY_BUBBLES)) update_runtime_stat(st, STAT_TOPDOWN_RECOVERY_BUBBLES, - ctx, cpu, count); + cpu, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_RETIRING)) update_runtime_stat(st, STAT_TOPDOWN_RETIRING, - ctx, cpu, count); + cpu, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_BAD_SPEC)) update_runtime_stat(st, STAT_TOPDOWN_BAD_SPEC, - ctx, cpu, count); + cpu, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_FE_BOUND)) update_runtime_stat(st, STAT_TOPDOWN_FE_BOUND, - ctx, cpu, count); + cpu, count, &rsd); else if (perf_stat_evsel__is(counter, TOPDOWN_BE_BOUND)) update_runtime_stat(st, STAT_TOPDOWN_BE_BOUND, - ctx, cpu, count); + cpu, count, &rsd); else if (evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) update_runtime_stat(st, STAT_STALLED_CYCLES_FRONT, - ctx, cpu, count); + cpu, count, &rsd); else if (evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) update_runtime_stat(st, STAT_STALLED_CYCLES_BACK, - ctx, cpu, count); + cpu, count, &rsd); else if (evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) - update_runtime_stat(st, STAT_BRANCHES, ctx, cpu, count); + update_runtime_stat(st, STAT_BRANCHES, cpu, count, &rsd); else if (evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) - update_runtime_stat(st, STAT_CACHEREFS, ctx, cpu, count); + update_runtime_stat(st, STAT_CACHEREFS, cpu, count, &rsd); else if (evsel__match(counter, HW_CACHE, HW_CACHE_L1D)) - update_runtime_stat(st, STAT_L1_DCACHE, ctx, cpu, count); + update_runtime_stat(st, STAT_L1_DCACHE, cpu, count, &rsd); else if (evsel__match(counter, HW_CACHE, HW_CACHE_L1I)) - update_runtime_stat(st, STAT_L1_ICACHE, ctx, cpu, count); + update_runtime_stat(st, STAT_L1_ICACHE, cpu, count, &rsd); else if (evsel__match(counter, HW_CACHE, HW_CACHE_LL)) - update_runtime_stat(st, STAT_LL_CACHE, ctx, cpu, count); + update_runtime_stat(st, STAT_LL_CACHE, cpu, count, &rsd); else if (evsel__match(counter, HW_CACHE, HW_CACHE_DTLB)) - update_runtime_stat(st, STAT_DTLB_CACHE, ctx, cpu, count); + update_runtime_stat(st, STAT_DTLB_CACHE, cpu, count, &rsd); else if (evsel__match(counter, HW_CACHE, HW_CACHE_ITLB)) - update_runtime_stat(st, STAT_ITLB_CACHE, ctx, cpu, count); + update_runtime_stat(st, STAT_ITLB_CACHE, cpu, count, &rsd); else if (perf_stat_evsel__is(counter, SMI_NUM)) - update_runtime_stat(st, STAT_SMI_NUM, ctx, cpu, count); + update_runtime_stat(st, STAT_SMI_NUM, cpu, count, &rsd); else if (perf_stat_evsel__is(counter, APERF)) - update_runtime_stat(st, STAT_APERF, ctx, cpu, count); + update_runtime_stat(st, STAT_APERF, cpu, count, &rsd); if (counter->collect_stat) { v = saved_value_lookup(counter, cpu, true, STAT_NONE, 0, st); @@ -422,11 +433,12 @@ void perf_stat__collect_metric_expr(struct evlist *evsel_list) } static double runtime_stat_avg(struct runtime_stat *st, - enum stat_type type, int ctx, int cpu) + enum stat_type type, int cpu, + struct runtime_stat_data *rsd) { struct saved_value *v; - v = saved_value_lookup(NULL, cpu, false, type, ctx, st); + v = saved_value_lookup(NULL, cpu, false, type, rsd->ctx, st); if (!v) return 0.0; @@ -434,11 +446,12 @@ static double runtime_stat_avg(struct runtime_stat *st, } static double runtime_stat_n(struct runtime_stat *st, - enum stat_type type, int ctx, int cpu) + enum stat_type type, int cpu, + struct runtime_stat_data *rsd) { struct saved_value *v; - v = saved_value_lookup(NULL, cpu, false, type, ctx, st); + v = saved_value_lookup(NULL, cpu, false, type, rsd->ctx, st); if (!v) return 0.0; @@ -446,16 +459,15 @@ static double runtime_stat_n(struct runtime_stat *st, } static void print_stalled_cycles_frontend(struct perf_stat_config *config, - int cpu, - struct evsel *evsel, double avg, + int cpu, double avg, struct perf_stat_output_ctx *out, - struct runtime_stat *st) + struct runtime_stat *st, + struct runtime_stat_data *rsd) { double total, ratio = 0.0; const char *color; - int ctx = evsel_context(evsel); - total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu); + total = runtime_stat_avg(st, STAT_CYCLES, cpu, rsd); if (total) ratio = avg / total * 100.0; @@ -470,16 +482,15 @@ static void print_stalled_cycles_frontend(struct perf_stat_config *config, } static void print_stalled_cycles_backend(struct perf_stat_config *config, - int cpu, - struct evsel *evsel, double avg, + int cpu, double avg, struct perf_stat_output_ctx *out, - struct runtime_stat *st) + struct runtime_stat *st, + struct runtime_stat_data *rsd) { double total, ratio = 0.0; const char *color; - int ctx = evsel_context(evsel); - total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu); + total = runtime_stat_avg(st, STAT_CYCLES, cpu, rsd); if (total) ratio = avg / total * 100.0; @@ -490,17 +501,15 @@ static void print_stalled_cycles_backend(struct perf_stat_config *config, } static void print_branch_misses(struct perf_stat_config *config, - int cpu, - struct evsel *evsel, - double avg, + int cpu, double avg, struct perf_stat_output_ctx *out, - struct runtime_stat *st) + struct runtime_stat *st, + struct runtime_stat_data *rsd) { double total, ratio = 0.0; const char *color; - int ctx = evsel_context(evsel); - total = runtime_stat_avg(st, STAT_BRANCHES, ctx, cpu); + total = runtime_stat_avg(st, STAT_BRANCHES, cpu, rsd); if (total) ratio = avg / total * 100.0; @@ -511,18 +520,15 @@ static void print_branch_misses(struct perf_stat_config *config, } static void print_l1_dcache_misses(struct perf_stat_config *config, - int cpu, - struct evsel *evsel, - double avg, + int cpu, double avg, struct perf_stat_output_ctx *out, - struct runtime_stat *st) - + struct runtime_stat *st, + struct runtime_stat_data *rsd) { double total, ratio = 0.0; const char *color; - int ctx = evsel_context(evsel); - total = runtime_stat_avg(st, STAT_L1_DCACHE, ctx, cpu); + total = runtime_stat_avg(st, STAT_L1_DCACHE, cpu, rsd); if (total) ratio = avg / total * 100.0; @@ -533,18 +539,15 @@ static void print_l1_dcache_misses(struct perf_stat_config *config, } static void print_l1_icache_misses(struct perf_stat_config *config, - int cpu, - struct evsel *evsel, - double avg, + int cpu, double avg, struct perf_stat_output_ctx *out, - struct runtime_stat *st) - + struct runtime_stat *st, + struct runtime_stat_data *rsd) { double total, ratio = 0.0; const char *color; - int ctx = evsel_context(evsel); - total = runtime_stat_avg(st, STAT_L1_ICACHE, ctx, cpu); + total = runtime_stat_avg(st, STAT_L1_ICACHE, cpu, rsd); if (total) ratio = avg / total * 100.0; @@ -554,17 +557,15 @@ static void print_l1_icache_misses(struct perf_stat_config *config, } static void print_dtlb_cache_misses(struct perf_stat_config *config, - int cpu, - struct evsel *evsel, - double avg, + int cpu, double avg, struct perf_stat_output_ctx *out, - struct runtime_stat *st) + struct runtime_stat *st, + struct runtime_stat_data *rsd) { double total, ratio = 0.0; const char *color; - int ctx = evsel_context(evsel); - total = runtime_stat_avg(st, STAT_DTLB_CACHE, ctx, cpu); + total = runtime_stat_avg(st, STAT_DTLB_CACHE, cpu, rsd); if (total) ratio = avg / total * 100.0; @@ -574,17 +575,15 @@ static void print_dtlb_cache_misses(struct perf_stat_config *config, } static void print_itlb_cache_misses(struct perf_stat_config *config, - int cpu, - struct evsel *evsel, - double avg, + int cpu, double avg, struct perf_stat_output_ctx *out, - struct runtime_stat *st) + struct runtime_stat *st, + struct runtime_stat_data *rsd) { double total, ratio = 0.0; const char *color; - int ctx = evsel_context(evsel); - total = runtime_stat_avg(st, STAT_ITLB_CACHE, ctx, cpu); + total = runtime_stat_avg(st, STAT_ITLB_CACHE, cpu, rsd); if (total) ratio = avg / total * 100.0; @@ -594,17 +593,15 @@ static void print_itlb_cache_misses(struct perf_stat_config *config, } static void print_ll_cache_misses(struct perf_stat_config *config, - int cpu, - struct evsel *evsel, - double avg, + int cpu, double avg, struct perf_stat_output_ctx *out, - struct runtime_stat *st) + struct runtime_stat *st, + struct runtime_stat_data *rsd) { double total, ratio = 0.0; const char *color; - int ctx = evsel_context(evsel); - total = runtime_stat_avg(st, STAT_LL_CACHE, ctx, cpu); + total = runtime_stat_avg(st, STAT_LL_CACHE, cpu, rsd); if (total) ratio = avg / total * 100.0; @@ -662,56 +659,61 @@ static double sanitize_val(double x) return x; } -static double td_total_slots(int ctx, int cpu, struct runtime_stat *st) +static double td_total_slots(int cpu, struct runtime_stat *st, + struct runtime_stat_data *rsd) { - return runtime_stat_avg(st, STAT_TOPDOWN_TOTAL_SLOTS, ctx, cpu); + return runtime_stat_avg(st, STAT_TOPDOWN_TOTAL_SLOTS, cpu, rsd); } -static double td_bad_spec(int ctx, int cpu, struct runtime_stat *st) +static double td_bad_spec(int cpu, struct runtime_stat *st, + struct runtime_stat_data *rsd) { double bad_spec = 0; double total_slots; double total; - total = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_ISSUED, ctx, cpu) - - runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED, ctx, cpu) + - runtime_stat_avg(st, STAT_TOPDOWN_RECOVERY_BUBBLES, ctx, cpu); + total = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_ISSUED, cpu, rsd) - + runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED, cpu, rsd) + + runtime_stat_avg(st, STAT_TOPDOWN_RECOVERY_BUBBLES, cpu, rsd); - total_slots = td_total_slots(ctx, cpu, st); + total_slots = td_total_slots(cpu, st, rsd); if (total_slots) bad_spec = total / total_slots; return sanitize_val(bad_spec); } -static double td_retiring(int ctx, int cpu, struct runtime_stat *st) +static double td_retiring(int cpu, struct runtime_stat *st, + struct runtime_stat_data *rsd) { double retiring = 0; - double total_slots = td_total_slots(ctx, cpu, st); + double total_slots = td_total_slots(cpu, st, rsd); double ret_slots = runtime_stat_avg(st, STAT_TOPDOWN_SLOTS_RETIRED, - ctx, cpu); + cpu, rsd); if (total_slots) retiring = ret_slots / total_slots; return retiring; } -static double td_fe_bound(int ctx, int cpu, struct runtime_stat *st) +static double td_fe_bound(int cpu, struct runtime_stat *st, + struct runtime_stat_data *rsd) { double fe_bound = 0; - double total_slots = td_total_slots(ctx, cpu, st); + double total_slots = td_total_slots(cpu, st, rsd); double fetch_bub = runtime_stat_avg(st, STAT_TOPDOWN_FETCH_BUBBLES, - ctx, cpu); + cpu, rsd); if (total_slots) fe_bound = fetch_bub / total_slots; return fe_bound; } -static double td_be_bound(int ctx, int cpu, struct runtime_stat *st) +static double td_be_bound(int cpu, struct runtime_stat *st, + struct runtime_stat_data *rsd) { - double sum = (td_fe_bound(ctx, cpu, st) + - td_bad_spec(ctx, cpu, st) + - td_retiring(ctx, cpu, st)); + double sum = (td_fe_bound(cpu, st, rsd) + + td_bad_spec(cpu, st, rsd) + + td_retiring(cpu, st, rsd)); if (sum == 0) return 0; return sanitize_val(1.0 - sum); @@ -722,15 +724,15 @@ static double td_be_bound(int ctx, int cpu, struct runtime_stat *st) * the ratios we need to recreate the sum. */ -static double td_metric_ratio(int ctx, int cpu, - enum stat_type type, - struct runtime_stat *stat) +static double td_metric_ratio(int cpu, enum stat_type type, + struct runtime_stat *stat, + struct runtime_stat_data *rsd) { - double sum = runtime_stat_avg(stat, STAT_TOPDOWN_RETIRING, ctx, cpu) + - runtime_stat_avg(stat, STAT_TOPDOWN_FE_BOUND, ctx, cpu) + - runtime_stat_avg(stat, STAT_TOPDOWN_BE_BOUND, ctx, cpu) + - runtime_stat_avg(stat, STAT_TOPDOWN_BAD_SPEC, ctx, cpu); - double d = runtime_stat_avg(stat, type, ctx, cpu); + double sum = runtime_stat_avg(stat, STAT_TOPDOWN_RETIRING, cpu, rsd) + + runtime_stat_avg(stat, STAT_TOPDOWN_FE_BOUND, cpu, rsd) + + runtime_stat_avg(stat, STAT_TOPDOWN_BE_BOUND, cpu, rsd) + + runtime_stat_avg(stat, STAT_TOPDOWN_BAD_SPEC, cpu, rsd); + double d = runtime_stat_avg(stat, type, cpu, rsd); if (sum) return d / sum; @@ -742,34 +744,33 @@ static double td_metric_ratio(int ctx, int cpu, * We allow two missing. */ -static bool full_td(int ctx, int cpu, - struct runtime_stat *stat) +static bool full_td(int cpu, struct runtime_stat *stat, + struct runtime_stat_data *rsd) { int c = 0; - if (runtime_stat_avg(stat, STAT_TOPDOWN_RETIRING, ctx, cpu) > 0) + if (runtime_stat_avg(stat, STAT_TOPDOWN_RETIRING, cpu, rsd) > 0) c++; - if (runtime_stat_avg(stat, STAT_TOPDOWN_BE_BOUND, ctx, cpu) > 0) + if (runtime_stat_avg(stat, STAT_TOPDOWN_BE_BOUND, cpu, rsd) > 0) c++; - if (runtime_stat_avg(stat, STAT_TOPDOWN_FE_BOUND, ctx, cpu) > 0) + if (runtime_stat_avg(stat, STAT_TOPDOWN_FE_BOUND, cpu, rsd) > 0) c++; - if (runtime_stat_avg(stat, STAT_TOPDOWN_BAD_SPEC, ctx, cpu) > 0) + if (runtime_stat_avg(stat, STAT_TOPDOWN_BAD_SPEC, cpu, rsd) > 0) c++; return c >= 2; } -static void print_smi_cost(struct perf_stat_config *config, - int cpu, struct evsel *evsel, +static void print_smi_cost(struct perf_stat_config *config, int cpu, struct perf_stat_output_ctx *out, - struct runtime_stat *st) + struct runtime_stat *st, + struct runtime_stat_data *rsd) { double smi_num, aperf, cycles, cost = 0.0; - int ctx = evsel_context(evsel); const char *color = NULL; - smi_num = runtime_stat_avg(st, STAT_SMI_NUM, ctx, cpu); - aperf = runtime_stat_avg(st, STAT_APERF, ctx, cpu); - cycles = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu); + smi_num = runtime_stat_avg(st, STAT_SMI_NUM, cpu, rsd); + aperf = runtime_stat_avg(st, STAT_APERF, cpu, rsd); + cycles = runtime_stat_avg(st, STAT_CYCLES, cpu, rsd); if ((cycles == 0) || (aperf == 0)) return; @@ -930,12 +931,14 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, print_metric_t print_metric = out->print_metric; double total, ratio = 0.0, total2; const char *color = NULL; - int ctx = evsel_context(evsel); + struct runtime_stat_data rsd = { + .ctx = evsel_context(evsel), + }; struct metric_event *me; int num = 1; if (evsel__match(evsel, HARDWARE, HW_INSTRUCTIONS)) { - total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu); + total = runtime_stat_avg(st, STAT_CYCLES, cpu, &rsd); if (total) { ratio = avg / total; @@ -945,12 +948,11 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, print_metric(config, ctxp, NULL, NULL, "insn per cycle", 0); } - total = runtime_stat_avg(st, STAT_STALLED_CYCLES_FRONT, - ctx, cpu); + total = runtime_stat_avg(st, STAT_STALLED_CYCLES_FRONT, cpu, &rsd); total = max(total, runtime_stat_avg(st, STAT_STALLED_CYCLES_BACK, - ctx, cpu)); + cpu, &rsd)); if (total && avg) { out->new_line(config, ctxp); @@ -960,8 +962,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, ratio); } } else if (evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES)) { - if (runtime_stat_n(st, STAT_BRANCHES, ctx, cpu) != 0) - print_branch_misses(config, cpu, evsel, avg, out, st); + if (runtime_stat_n(st, STAT_BRANCHES, cpu, &rsd) != 0) + print_branch_misses(config, cpu, avg, out, st, &rsd); else print_metric(config, ctxp, NULL, NULL, "of all branches", 0); } else if ( @@ -970,8 +972,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { - if (runtime_stat_n(st, STAT_L1_DCACHE, ctx, cpu) != 0) - print_l1_dcache_misses(config, cpu, evsel, avg, out, st); + if (runtime_stat_n(st, STAT_L1_DCACHE, cpu, &rsd) != 0) + print_l1_dcache_misses(config, cpu, avg, out, st, &rsd); else print_metric(config, ctxp, NULL, NULL, "of all L1-dcache accesses", 0); } else if ( @@ -980,8 +982,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { - if (runtime_stat_n(st, STAT_L1_ICACHE, ctx, cpu) != 0) - print_l1_icache_misses(config, cpu, evsel, avg, out, st); + if (runtime_stat_n(st, STAT_L1_ICACHE, cpu, &rsd) != 0) + print_l1_icache_misses(config, cpu, avg, out, st, &rsd); else print_metric(config, ctxp, NULL, NULL, "of all L1-icache accesses", 0); } else if ( @@ -990,8 +992,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { - if (runtime_stat_n(st, STAT_DTLB_CACHE, ctx, cpu) != 0) - print_dtlb_cache_misses(config, cpu, evsel, avg, out, st); + if (runtime_stat_n(st, STAT_DTLB_CACHE, cpu, &rsd) != 0) + print_dtlb_cache_misses(config, cpu, avg, out, st, &rsd); else print_metric(config, ctxp, NULL, NULL, "of all dTLB cache accesses", 0); } else if ( @@ -1000,8 +1002,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { - if (runtime_stat_n(st, STAT_ITLB_CACHE, ctx, cpu) != 0) - print_itlb_cache_misses(config, cpu, evsel, avg, out, st); + if (runtime_stat_n(st, STAT_ITLB_CACHE, cpu, &rsd) != 0) + print_itlb_cache_misses(config, cpu, avg, out, st, &rsd); else print_metric(config, ctxp, NULL, NULL, "of all iTLB cache accesses", 0); } else if ( @@ -1010,27 +1012,27 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16))) { - if (runtime_stat_n(st, STAT_LL_CACHE, ctx, cpu) != 0) - print_ll_cache_misses(config, cpu, evsel, avg, out, st); + if (runtime_stat_n(st, STAT_LL_CACHE, cpu, &rsd) != 0) + print_ll_cache_misses(config, cpu, avg, out, st, &rsd); else print_metric(config, ctxp, NULL, NULL, "of all LL-cache accesses", 0); } else if (evsel__match(evsel, HARDWARE, HW_CACHE_MISSES)) { - total = runtime_stat_avg(st, STAT_CACHEREFS, ctx, cpu); + total = runtime_stat_avg(st, STAT_CACHEREFS, cpu, &rsd); if (total) ratio = avg * 100 / total; - if (runtime_stat_n(st, STAT_CACHEREFS, ctx, cpu) != 0) + if (runtime_stat_n(st, STAT_CACHEREFS, cpu, &rsd) != 0) print_metric(config, ctxp, NULL, "%8.3f %%", "of all cache refs", ratio); else print_metric(config, ctxp, NULL, NULL, "of all cache refs", 0); } else if (evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) { - print_stalled_cycles_frontend(config, cpu, evsel, avg, out, st); + print_stalled_cycles_frontend(config, cpu, avg, out, st, &rsd); } else if (evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) { - print_stalled_cycles_backend(config, cpu, evsel, avg, out, st); + print_stalled_cycles_backend(config, cpu, avg, out, st, &rsd); } else if (evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { - total = runtime_stat_avg(st, STAT_NSECS, 0, cpu); + total = runtime_stat_avg(st, STAT_NSECS, cpu, &rsd); if (total) { ratio = avg / total; @@ -1039,7 +1041,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, print_metric(config, ctxp, NULL, NULL, "Ghz", 0); } } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX)) { - total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu); + total = runtime_stat_avg(st, STAT_CYCLES, cpu, &rsd); if (total) print_metric(config, ctxp, NULL, @@ -1049,8 +1051,8 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, print_metric(config, ctxp, NULL, NULL, "transactional cycles", 0); } else if (perf_stat_evsel__is(evsel, CYCLES_IN_TX_CP)) { - total = runtime_stat_avg(st, STAT_CYCLES, ctx, cpu); - total2 = runtime_stat_avg(st, STAT_CYCLES_IN_TX, ctx, cpu); + total = runtime_stat_avg(st, STAT_CYCLES, cpu, &rsd); + total2 = runtime_stat_avg(st, STAT_CYCLES_IN_TX, cpu, &rsd); if (total2 < avg) total2 = avg; @@ -1060,21 +1062,19 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, else print_metric(config, ctxp, NULL, NULL, "aborted cycles", 0); } else if (perf_stat_evsel__is(evsel, TRANSACTION_START)) { - total = runtime_stat_avg(st, STAT_CYCLES_IN_TX, - ctx, cpu); + total = runtime_stat_avg(st, STAT_CYCLES_IN_TX, cpu, &rsd); if (avg) ratio = total / avg; - if (runtime_stat_n(st, STAT_CYCLES_IN_TX, ctx, cpu) != 0) + if (runtime_stat_n(st, STAT_CYCLES_IN_TX, cpu, &rsd) != 0) print_metric(config, ctxp, NULL, "%8.0f", "cycles / transaction", ratio); else print_metric(config, ctxp, NULL, NULL, "cycles / transaction", 0); } else if (perf_stat_evsel__is(evsel, ELISION_START)) { - total = runtime_stat_avg(st, STAT_CYCLES_IN_TX, - ctx, cpu); + total = runtime_stat_avg(st, STAT_CYCLES_IN_TX, cpu, &rsd); if (avg) ratio = total / avg; @@ -1087,28 +1087,28 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, else print_metric(config, ctxp, NULL, NULL, "CPUs utilized", 0); } else if (perf_stat_evsel__is(evsel, TOPDOWN_FETCH_BUBBLES)) { - double fe_bound = td_fe_bound(ctx, cpu, st); + double fe_bound = td_fe_bound(cpu, st, &rsd); if (fe_bound > 0.2) color = PERF_COLOR_RED; print_metric(config, ctxp, color, "%8.1f%%", "frontend bound", fe_bound * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_RETIRED)) { - double retiring = td_retiring(ctx, cpu, st); + double retiring = td_retiring(cpu, st, &rsd); if (retiring > 0.7) color = PERF_COLOR_GREEN; print_metric(config, ctxp, color, "%8.1f%%", "retiring", retiring * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_RECOVERY_BUBBLES)) { - double bad_spec = td_bad_spec(ctx, cpu, st); + double bad_spec = td_bad_spec(cpu, st, &rsd); if (bad_spec > 0.1) color = PERF_COLOR_RED; print_metric(config, ctxp, color, "%8.1f%%", "bad speculation", bad_spec * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_SLOTS_ISSUED)) { - double be_bound = td_be_bound(ctx, cpu, st); + double be_bound = td_be_bound(cpu, st, &rsd); const char *name = "backend bound"; static int have_recovery_bubbles = -1; @@ -1121,43 +1121,43 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, if (be_bound > 0.2) color = PERF_COLOR_RED; - if (td_total_slots(ctx, cpu, st) > 0) + if (td_total_slots(cpu, st, &rsd) > 0) print_metric(config, ctxp, color, "%8.1f%%", name, be_bound * 100.); else print_metric(config, ctxp, NULL, NULL, name, 0); } else if (perf_stat_evsel__is(evsel, TOPDOWN_RETIRING) && - full_td(ctx, cpu, st)) { - double retiring = td_metric_ratio(ctx, cpu, - STAT_TOPDOWN_RETIRING, st); - + full_td(cpu, st, &rsd)) { + double retiring = td_metric_ratio(cpu, + STAT_TOPDOWN_RETIRING, st, + &rsd); if (retiring > 0.7) color = PERF_COLOR_GREEN; print_metric(config, ctxp, color, "%8.1f%%", "retiring", retiring * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_FE_BOUND) && - full_td(ctx, cpu, st)) { - double fe_bound = td_metric_ratio(ctx, cpu, - STAT_TOPDOWN_FE_BOUND, st); - + full_td(cpu, st, &rsd)) { + double fe_bound = td_metric_ratio(cpu, + STAT_TOPDOWN_FE_BOUND, st, + &rsd); if (fe_bound > 0.2) color = PERF_COLOR_RED; print_metric(config, ctxp, color, "%8.1f%%", "frontend bound", fe_bound * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_BE_BOUND) && - full_td(ctx, cpu, st)) { - double be_bound = td_metric_ratio(ctx, cpu, - STAT_TOPDOWN_BE_BOUND, st); - + full_td(cpu, st, &rsd)) { + double be_bound = td_metric_ratio(cpu, + STAT_TOPDOWN_BE_BOUND, st, + &rsd); if (be_bound > 0.2) color = PERF_COLOR_RED; print_metric(config, ctxp, color, "%8.1f%%", "backend bound", be_bound * 100.); } else if (perf_stat_evsel__is(evsel, TOPDOWN_BAD_SPEC) && - full_td(ctx, cpu, st)) { - double bad_spec = td_metric_ratio(ctx, cpu, - STAT_TOPDOWN_BAD_SPEC, st); - + full_td(cpu, st, &rsd)) { + double bad_spec = td_metric_ratio(cpu, + STAT_TOPDOWN_BAD_SPEC, st, + &rsd); if (bad_spec > 0.1) color = PERF_COLOR_RED; print_metric(config, ctxp, color, "%8.1f%%", "bad speculation", @@ -1165,11 +1165,11 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, } else if (evsel->metric_expr) { generic_metric(config, evsel->metric_expr, evsel->metric_events, NULL, evsel->name, evsel->metric_name, NULL, 1, cpu, out, st); - } else if (runtime_stat_n(st, STAT_NSECS, 0, cpu) != 0) { + } else if (runtime_stat_n(st, STAT_NSECS, cpu, &rsd) != 0) { char unit = 'M'; char unit_buf[10]; - total = runtime_stat_avg(st, STAT_NSECS, 0, cpu); + total = runtime_stat_avg(st, STAT_NSECS, cpu, &rsd); if (total) ratio = 1000.0 * avg / total; @@ -1180,7 +1180,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, snprintf(unit_buf, sizeof(unit_buf), "%c/sec", unit); print_metric(config, ctxp, NULL, "%8.3f", unit_buf, ratio); } else if (perf_stat_evsel__is(evsel, SMI_NUM)) { - print_smi_cost(config, cpu, evsel, out, st); + print_smi_cost(config, cpu, out, st, &rsd); } else { num = 0; } From a1bf23052bdfe30ec3c693cf32feb2d79114ac16 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 15 Jan 2021 16:11:39 +0900 Subject: [PATCH 191/241] perf stat: Take cgroups into account for shadow stats As of now it doesn't consider cgroups when collecting shadow stats and metrics so counter values from different cgroups will be saved in a same slot. This resulted in incorrect numbers when those cgroups have different workloads. For example, let's look at the scenario below: cgroups A and C runs same workload which burns a cpu while cgroup B runs a light workload. $ perf stat -a -e cycles,instructions --for-each-cgroup A,B,C sleep 1 Performance counter stats for 'system wide': 3,958,116,522 cycles A 6,722,650,929 instructions A # 2.53 insn per cycle 1,132,741 cycles B 571,743 instructions B # 0.00 insn per cycle 4,007,799,935 cycles C 6,793,181,523 instructions C # 2.56 insn per cycle 1.001050869 seconds time elapsed When I run 'perf stat' with single workload, it usually shows IPC around 1.7. We can verify it (6,722,650,929.0 / 3,958,116,522 = 1.698) for cgroup A. But in this case, since cgroups are ignored, cycles are averaged so it used the lower value for IPC calculation and resulted in around 2.5. avg cycle: (3958116522 + 1132741 + 4007799935) / 3 = 2655683066 IPC (A) : 6722650929 / 2655683066 = 2.531 IPC (B) : 571743 / 2655683066 = 0.0002 IPC (C) : 6793181523 / 2655683066 = 2.557 We can simply compare cgroup pointers in the evsel and it'll be NULL when cgroups are not specified. With this patch, I can see correct numbers like below: $ perf stat -a -e cycles,instructions --for-each-cgroup A,B,C sleep 1 Performance counter stats for 'system wide': 4,171,051,687 cycles A 7,219,793,922 instructions A # 1.73 insn per cycle 1,051,189 cycles B 583,102 instructions B # 0.55 insn per cycle 4,171,124,710 cycles C 7,192,944,580 instructions C # 1.72 insn per cycle 1.007909814 seconds time elapsed Signed-off-by: Namhyung Kim Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Andi Kleen Cc: Ian Rogers Cc: Jin Yao Cc: Mark Rutland Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lore.kernel.org/lkml/20210115071139.257042-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/stat-shadow.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/tools/perf/util/stat-shadow.c b/tools/perf/util/stat-shadow.c index a1565b6e38f2..12eafd12a693 100644 --- a/tools/perf/util/stat-shadow.c +++ b/tools/perf/util/stat-shadow.c @@ -8,6 +8,7 @@ #include "evlist.h" #include "expr.h" #include "metricgroup.h" +#include "cgroup.h" #include /* @@ -28,6 +29,7 @@ struct saved_value { enum stat_type type; int ctx; int cpu; + struct cgroup *cgrp; struct runtime_stat *stat; struct stats stats; u64 metric_total; @@ -57,6 +59,9 @@ static int saved_value_cmp(struct rb_node *rb_node, const void *entry) if (a->ctx != b->ctx) return a->ctx - b->ctx; + if (a->cgrp != b->cgrp) + return (char *)a->cgrp < (char *)b->cgrp ? -1 : +1; + if (a->evsel == NULL && b->evsel == NULL) { if (a->stat == b->stat) return 0; @@ -100,7 +105,8 @@ static struct saved_value *saved_value_lookup(struct evsel *evsel, bool create, enum stat_type type, int ctx, - struct runtime_stat *st) + struct runtime_stat *st, + struct cgroup *cgrp) { struct rblist *rblist; struct rb_node *nd; @@ -110,6 +116,7 @@ static struct saved_value *saved_value_lookup(struct evsel *evsel, .type = type, .ctx = ctx, .stat = st, + .cgrp = cgrp, }; rblist = &st->value_list; @@ -197,6 +204,7 @@ void perf_stat__reset_shadow_per_stat(struct runtime_stat *st) struct runtime_stat_data { int ctx; + struct cgroup *cgrp; }; static void update_runtime_stat(struct runtime_stat *st, @@ -205,7 +213,7 @@ static void update_runtime_stat(struct runtime_stat *st, struct runtime_stat_data *rsd) { struct saved_value *v = saved_value_lookup(NULL, cpu, true, type, - rsd->ctx, st); + rsd->ctx, st, rsd->cgrp); if (v) update_stats(&v->stats, count); @@ -223,6 +231,7 @@ void perf_stat__update_shadow_stats(struct evsel *counter, u64 count, struct saved_value *v; struct runtime_stat_data rsd = { .ctx = evsel_context(counter), + .cgrp = counter->cgrp, }; count *= counter->scale; @@ -290,13 +299,14 @@ void perf_stat__update_shadow_stats(struct evsel *counter, u64 count, update_runtime_stat(st, STAT_APERF, cpu, count, &rsd); if (counter->collect_stat) { - v = saved_value_lookup(counter, cpu, true, STAT_NONE, 0, st); + v = saved_value_lookup(counter, cpu, true, STAT_NONE, 0, st, + rsd.cgrp); update_stats(&v->stats, count); if (counter->metric_leader) v->metric_total += count; } else if (counter->metric_leader) { v = saved_value_lookup(counter->metric_leader, - cpu, true, STAT_NONE, 0, st); + cpu, true, STAT_NONE, 0, st, rsd.cgrp); v->metric_total += count; v->metric_other++; } @@ -438,7 +448,7 @@ static double runtime_stat_avg(struct runtime_stat *st, { struct saved_value *v; - v = saved_value_lookup(NULL, cpu, false, type, rsd->ctx, st); + v = saved_value_lookup(NULL, cpu, false, type, rsd->ctx, st, rsd->cgrp); if (!v) return 0.0; @@ -451,7 +461,7 @@ static double runtime_stat_n(struct runtime_stat *st, { struct saved_value *v; - v = saved_value_lookup(NULL, cpu, false, type, rsd->ctx, st); + v = saved_value_lookup(NULL, cpu, false, type, rsd->ctx, st, rsd->cgrp); if (!v) return 0.0; @@ -805,7 +815,8 @@ static int prepare_metric(struct evsel **metric_events, scale = 1e-9; } else { v = saved_value_lookup(metric_events[i], cpu, false, - STAT_NONE, 0, st); + STAT_NONE, 0, st, + metric_events[i]->cgrp); if (!v) break; stats = &v->stats; @@ -933,6 +944,7 @@ void perf_stat__print_shadow_stats(struct perf_stat_config *config, const char *color = NULL; struct runtime_stat_data rsd = { .ctx = evsel_context(evsel), + .cgrp = evsel->cgrp, }; struct metric_event *me; int num = 1; From 5501e9229a80d95a1ea68609f44c447a75d23ed5 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 7 Jan 2021 19:41:59 +0200 Subject: [PATCH 192/241] perf intel-pt: Fix 'CPU too large' error In some cases, the number of cpus (nr_cpus_online) is confused with the maximum cpu number (nr_cpus_avail), which results in the error in the example below: Example on system with 8 cpus: Before: # echo 0 > /sys/devices/system/cpu/cpu2/online # ./perf record --kcore -e intel_pt// taskset --cpu-list 7 uname Linux [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.147 MB perf.data ] # ./perf script --itrace=e Requested CPU 7 too large. Consider raising MAX_NR_CPUS 0x25908 [0x8]: failed to process type: 68 [Invalid argument] After: # ./perf script --itrace=e # Fixes: 8c7274691f0d ("perf machine: Replace MAX_NR_CPUS with perf_env::nr_cpus_online") Fixes: 7df4e36a4785 ("perf session: Replace MAX_NR_CPUS with perf_env::nr_cpus_online") Signed-off-by: Adrian Hunter Tested-by: Kan Liang Cc: Jiri Olsa Cc: stable@vger.kernel.org Link: http://lore.kernel.org/lkml/20210107174159.24897-1-adrian.hunter@intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/machine.c | 4 ++-- tools/perf/util/session.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/machine.c b/tools/perf/util/machine.c index f841f3503cae..1e9d3f982b47 100644 --- a/tools/perf/util/machine.c +++ b/tools/perf/util/machine.c @@ -2980,7 +2980,7 @@ int machines__for_each_thread(struct machines *machines, pid_t machine__get_current_tid(struct machine *machine, int cpu) { - int nr_cpus = min(machine->env->nr_cpus_online, MAX_NR_CPUS); + int nr_cpus = min(machine->env->nr_cpus_avail, MAX_NR_CPUS); if (cpu < 0 || cpu >= nr_cpus || !machine->current_tid) return -1; @@ -2992,7 +2992,7 @@ int machine__set_current_tid(struct machine *machine, int cpu, pid_t pid, pid_t tid) { struct thread *thread; - int nr_cpus = min(machine->env->nr_cpus_online, MAX_NR_CPUS); + int nr_cpus = min(machine->env->nr_cpus_avail, MAX_NR_CPUS); if (cpu < 0) return -EINVAL; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 50ff9795a4f1..25adbcce0281 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -2404,7 +2404,7 @@ int perf_session__cpu_bitmap(struct perf_session *session, { int i, err = -1; struct perf_cpu_map *map; - int nr_cpus = min(session->header.env.nr_cpus_online, MAX_NR_CPUS); + int nr_cpus = min(session->header.env.nr_cpus_avail, MAX_NR_CPUS); for (i = 0; i < PERF_TYPE_MAX; ++i) { struct evsel *evsel; From 648b054a4647cd62e13ba79f398b8b97a7c82b19 Mon Sep 17 00:00:00 2001 From: Al Grant Date: Tue, 24 Nov 2020 19:58:17 +0000 Subject: [PATCH 193/241] perf inject: Correct event attribute sizes When 'perf inject' reads a perf.data file from an older version of perf, it writes event attributes into the output with the original size field, but lays them out as if they had the size currently used. Readers see a corrupt file. Update the size field to match the layout. Signed-off-by: Al Grant Acked-by: Jiri Olsa Cc: Alexander Shishkin Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Link: http://lore.kernel.org/lkml/20201124195818.30603-1-al.grant@arm.com Signed-off-by: Denis Nikitin Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/header.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 062383e225a3..c4ed3dc2c8f4 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -3323,6 +3323,14 @@ int perf_session__write_header(struct perf_session *session, attr_offset = lseek(ff.fd, 0, SEEK_CUR); evlist__for_each_entry(evlist, evsel) { + if (evsel->core.attr.size < sizeof(evsel->core.attr)) { + /* + * We are likely in "perf inject" and have read + * from an older file. Update attr size so that + * reader gets the right offset to the ids. + */ + evsel->core.attr.size = sizeof(evsel->core.attr); + } f_attr = (struct perf_file_attr){ .attr = evsel->core.attr, .ids = { From 235ecd36c7a93e4d6c73ac71137b8f1fa31148dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20T=C3=B6pel?= Date: Fri, 15 Jan 2021 11:43:37 +0100 Subject: [PATCH 194/241] MAINTAINERS: Update my email address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit My Intel email will stop working in a not too distant future. Move my MAINTAINERS entries to my kernel.org address. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Link: https://lore.kernel.org/bpf/20210115104337.7751-1-bjorn.topel@gmail.com --- .mailmap | 2 ++ MAINTAINERS | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index 632700cee55c..b1ab0129c7d6 100644 --- a/.mailmap +++ b/.mailmap @@ -55,6 +55,8 @@ Bart Van Assche Ben Gardner Ben M Cahill Björn Steinbrink +Björn Töpel +Björn Töpel Boris Brezillon Boris Brezillon Boris Brezillon diff --git a/MAINTAINERS b/MAINTAINERS index b15514a770e3..0dfd1a67d430 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3336,7 +3336,7 @@ F: arch/riscv/net/ X: arch/riscv/net/bpf_jit_comp64.c BPF JIT for RISC-V (64-bit) -M: Björn Töpel +M: Björn Töpel L: netdev@vger.kernel.org L: bpf@vger.kernel.org S: Maintained @@ -19409,7 +19409,7 @@ F: drivers/net/ethernet/*/*/*xdp* K: (?:\b|_)xdp(?:\b|_) XDP SOCKETS (AF_XDP) -M: Björn Töpel +M: Björn Töpel M: Magnus Karlsson R: Jonathan Lemon L: netdev@vger.kernel.org From a8d13dbccb137c46fead2ec1a4f1fbc8cfc9ea91 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 15 Jan 2021 16:04:23 -0700 Subject: [PATCH 195/241] io_uring: ensure finish_wait() is always called in __io_uring_task_cancel() If we enter with requests pending and performm cancelations, we'll have a different inflight count before and after calling prepare_to_wait(). This causes the loop to restart. If we actually ended up canceling everything, or everything completed in-between, then we'll break out of the loop without calling finish_wait() on the waitqueue. This can trigger a warning on exit_signals(), as we leave the task state in TASK_UNINTERRUPTIBLE. Put a finish_wait() after the loop to catch that case. Cc: stable@vger.kernel.org # 5.9+ Signed-off-by: Jens Axboe --- fs/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/io_uring.c b/fs/io_uring.c index 06cc79d39586..985a9e3f976d 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -9101,6 +9101,7 @@ void __io_uring_task_cancel(void) finish_wait(&tctx->wait, &wait); } while (1); + finish_wait(&tctx->wait, &wait); atomic_dec(&tctx->in_idle); io_uring_remove_task_files(tctx); From c96adff95619178e2118925578343ad54857c80c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Fri, 15 Jan 2021 10:50:24 -0800 Subject: [PATCH 196/241] cls_flower: call nla_ok() before nla_next() fl_set_enc_opt() simply checks if there are still bytes left to parse, but this is not sufficent as syzbot seems to be able to generate malformatted netlink messages. nla_ok() is more strict so should be used to validate the next nlattr here. And nla_validate_nested_deprecated() has less strict check too, it is probably too late to switch to the strict version, but we can just call nla_ok() too after it. Reported-and-tested-by: syzbot+2624e3778b18fc497c92@syzkaller.appspotmail.com Fixes: 0a6e77784f49 ("net/sched: allow flower to match tunnel options") Fixes: 79b1011cb33d ("net: sched: allow flower to match erspan options") Cc: Jamal Hadi Salim Cc: Xin Long Cc: Jiri Pirko Signed-off-by: Cong Wang Link: https://lore.kernel.org/r/20210115185024.72298-1-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/cls_flower.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 1319986693fc..84f932532db7 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -1272,6 +1272,10 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, nla_opt_msk = nla_data(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); msk_depth = nla_len(tb[TCA_FLOWER_KEY_ENC_OPTS_MASK]); + if (!nla_ok(nla_opt_msk, msk_depth)) { + NL_SET_ERR_MSG(extack, "Invalid nested attribute for masks"); + return -EINVAL; + } } nla_for_each_attr(nla_opt_key, nla_enc_key, @@ -1307,9 +1311,6 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); return -EINVAL; } - - if (msk_depth) - nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); break; case TCA_FLOWER_KEY_ENC_OPTS_VXLAN: if (key->enc_opts.dst_opt_type) { @@ -1340,9 +1341,6 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); return -EINVAL; } - - if (msk_depth) - nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); break; case TCA_FLOWER_KEY_ENC_OPTS_ERSPAN: if (key->enc_opts.dst_opt_type) { @@ -1373,14 +1371,20 @@ static int fl_set_enc_opt(struct nlattr **tb, struct fl_flow_key *key, NL_SET_ERR_MSG(extack, "Key and mask miss aligned"); return -EINVAL; } - - if (msk_depth) - nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); break; default: NL_SET_ERR_MSG(extack, "Unknown tunnel option type"); return -EINVAL; } + + if (!msk_depth) + continue; + + if (!nla_ok(nla_opt_msk, msk_depth)) { + NL_SET_ERR_MSG(extack, "A mask attribute is invalid"); + return -EINVAL; + } + nla_opt_msk = nla_next(nla_opt_msk, &msk_depth); } return 0; From e4bedf48aaa5552bc1f49703abd17606e7e6e82a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Jan 2021 08:06:37 -0800 Subject: [PATCH 197/241] net_sched: reject silly cell_log in qdisc_get_rtab() iproute2 probably never goes beyond 8 for the cell exponent, but stick to the max shift exponent for signed 32bit. UBSAN reported: UBSAN: shift-out-of-bounds in net/sched/sch_api.c:389:22 shift exponent 130 is too large for 32-bit type 'int' CPU: 1 PID: 8450 Comm: syz-executor586 Not tainted 5.11.0-rc3-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x183/0x22e lib/dump_stack.c:120 ubsan_epilogue lib/ubsan.c:148 [inline] __ubsan_handle_shift_out_of_bounds+0x432/0x4d0 lib/ubsan.c:395 __detect_linklayer+0x2a9/0x330 net/sched/sch_api.c:389 qdisc_get_rtab+0x2b5/0x410 net/sched/sch_api.c:435 cbq_init+0x28f/0x12c0 net/sched/sch_cbq.c:1180 qdisc_create+0x801/0x1470 net/sched/sch_api.c:1246 tc_modify_qdisc+0x9e3/0x1fc0 net/sched/sch_api.c:1662 rtnetlink_rcv_msg+0xb1d/0xe60 net/core/rtnetlink.c:5564 netlink_rcv_skb+0x1f0/0x460 net/netlink/af_netlink.c:2494 netlink_unicast_kernel net/netlink/af_netlink.c:1304 [inline] netlink_unicast+0x7de/0x9b0 net/netlink/af_netlink.c:1330 netlink_sendmsg+0xaa6/0xe90 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg net/socket.c:672 [inline] ____sys_sendmsg+0x5a2/0x900 net/socket.c:2345 ___sys_sendmsg net/socket.c:2399 [inline] __sys_sendmsg+0x319/0x400 net/socket.c:2432 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Acked-by: Cong Wang Link: https://lore.kernel.org/r/20210114160637.1660597-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/sch_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 51cb553e4317..6fe4e5cc807c 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -412,7 +412,8 @@ struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r, { struct qdisc_rate_table *rtab; - if (tab == NULL || r->rate == 0 || r->cell_log == 0 || + if (tab == NULL || r->rate == 0 || + r->cell_log == 0 || r->cell_log >= 32 || nla_len(tab) != TC_RTAB_SIZE) { NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching"); return NULL; From dd5e073381f2ada3630f36be42833c6e9c78b75e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Jan 2021 10:19:29 -0800 Subject: [PATCH 198/241] net_sched: gen_estimator: support large ewma log syzbot report reminded us that very big ewma_log were supported in the past, even if they made litle sense. tc qdisc replace dev xxx root est 1sec 131072sec ... While fixing the bug, also add boundary checks for ewma_log, in line with range supported by iproute2. UBSAN: shift-out-of-bounds in net/core/gen_estimator.c:83:38 shift exponent -1 is negative CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.10.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:120 ubsan_epilogue+0xb/0x5a lib/ubsan.c:148 __ubsan_handle_shift_out_of_bounds.cold+0xb1/0x181 lib/ubsan.c:395 est_timer.cold+0xbb/0x12d net/core/gen_estimator.c:83 call_timer_fn+0x1a5/0x710 kernel/time/timer.c:1417 expire_timers kernel/time/timer.c:1462 [inline] __run_timers.part.0+0x692/0xa80 kernel/time/timer.c:1731 __run_timers kernel/time/timer.c:1712 [inline] run_timer_softirq+0xb3/0x1d0 kernel/time/timer.c:1744 __do_softirq+0x2bc/0xa77 kernel/softirq.c:343 asm_call_irq_on_stack+0xf/0x20 __run_on_irqstack arch/x86/include/asm/irq_stack.h:26 [inline] run_on_irqstack_cond arch/x86/include/asm/irq_stack.h:77 [inline] do_softirq_own_stack+0xaa/0xd0 arch/x86/kernel/irq_64.c:77 invoke_softirq kernel/softirq.c:226 [inline] __irq_exit_rcu+0x17f/0x200 kernel/softirq.c:420 irq_exit_rcu+0x5/0x20 kernel/softirq.c:432 sysvec_apic_timer_interrupt+0x4d/0x100 arch/x86/kernel/apic/apic.c:1096 asm_sysvec_apic_timer_interrupt+0x12/0x20 arch/x86/include/asm/idtentry.h:628 RIP: 0010:native_save_fl arch/x86/include/asm/irqflags.h:29 [inline] RIP: 0010:arch_local_save_flags arch/x86/include/asm/irqflags.h:79 [inline] RIP: 0010:arch_irqs_disabled arch/x86/include/asm/irqflags.h:169 [inline] RIP: 0010:acpi_safe_halt drivers/acpi/processor_idle.c:111 [inline] RIP: 0010:acpi_idle_do_entry+0x1c9/0x250 drivers/acpi/processor_idle.c:516 Fixes: 1c0d32fde5bd ("net_sched: gen_estimator: complete rewrite of rate estimators") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20210114181929.1717985-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/core/gen_estimator.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 80dbf2f4016e..8e582e29a41e 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -80,11 +80,11 @@ static void est_timer(struct timer_list *t) u64 rate, brate; est_fetch_counters(est, &b); - brate = (b.bytes - est->last_bytes) << (10 - est->ewma_log - est->intvl_log); - brate -= (est->avbps >> est->ewma_log); + brate = (b.bytes - est->last_bytes) << (10 - est->intvl_log); + brate = (brate >> est->ewma_log) - (est->avbps >> est->ewma_log); - rate = (b.packets - est->last_packets) << (10 - est->ewma_log - est->intvl_log); - rate -= (est->avpps >> est->ewma_log); + rate = (b.packets - est->last_packets) << (10 - est->intvl_log); + rate = (rate >> est->ewma_log) - (est->avpps >> est->ewma_log); write_seqcount_begin(&est->seq); est->avbps += brate; @@ -143,6 +143,9 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, if (parm->interval < -2 || parm->interval > 3) return -EINVAL; + if (parm->ewma_log == 0 || parm->ewma_log >= 31) + return -EINVAL; + est = kzalloc(sizeof(*est), GFP_KERNEL); if (!est) return -ENOBUFS; From bcd0cf19ef8258ac31b9a20248b05c15a1f4b4b0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Jan 2021 10:52:29 -0800 Subject: [PATCH 199/241] net_sched: avoid shift-out-of-bounds in tcindex_set_parms() tc_index being 16bit wide, we need to check that TCA_TCINDEX_SHIFT attribute is not silly. UBSAN: shift-out-of-bounds in net/sched/cls_tcindex.c:260:29 shift exponent 255 is too large for 32-bit type 'int' CPU: 0 PID: 8516 Comm: syz-executor228 Not tainted 5.10.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:79 [inline] dump_stack+0x107/0x163 lib/dump_stack.c:120 ubsan_epilogue+0xb/0x5a lib/ubsan.c:148 __ubsan_handle_shift_out_of_bounds.cold+0xb1/0x181 lib/ubsan.c:395 valid_perfect_hash net/sched/cls_tcindex.c:260 [inline] tcindex_set_parms.cold+0x1b/0x215 net/sched/cls_tcindex.c:425 tcindex_change+0x232/0x340 net/sched/cls_tcindex.c:546 tc_new_tfilter+0x13fb/0x21b0 net/sched/cls_api.c:2127 rtnetlink_rcv_msg+0x8b6/0xb80 net/core/rtnetlink.c:5555 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2494 netlink_unicast_kernel net/netlink/af_netlink.c:1304 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1330 netlink_sendmsg+0x907/0xe40 net/netlink/af_netlink.c:1919 sock_sendmsg_nosec net/socket.c:652 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:672 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2336 ___sys_sendmsg+0xf3/0x170 net/socket.c:2390 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2423 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xa9 Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Reported-by: syzbot Link: https://lore.kernel.org/r/20210114185229.1742255-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/cls_tcindex.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 78bec347b8b6..c4007b9cd16d 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -366,9 +366,13 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base, if (tb[TCA_TCINDEX_MASK]) cp->mask = nla_get_u16(tb[TCA_TCINDEX_MASK]); - if (tb[TCA_TCINDEX_SHIFT]) + if (tb[TCA_TCINDEX_SHIFT]) { cp->shift = nla_get_u32(tb[TCA_TCINDEX_SHIFT]); - + if (cp->shift > 16) { + err = -EINVAL; + goto errout; + } + } if (!cp->hash) { /* Hash not specified, use perfect hash if the upper limit * of the hashing index is below the threshold. From a959a9782fa87669feeed095ced5d78181a7c02d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 11 Jan 2021 18:19:26 +0100 Subject: [PATCH 200/241] iov_iter: fix the uaccess area in copy_compat_iovec_from_user sizeof needs to be called on the compat pointer, not the native one. Fixes: 89cd35c58bc2 ("iov_iter: transparently handle compat iovecs in import_iovec") Reported-by: David Laight Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- lib/iov_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 1635111c5bd2..a21e6a5792c5 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1658,7 +1658,7 @@ static int copy_compat_iovec_from_user(struct iovec *iov, (const struct compat_iovec __user *)uvec; int ret = -EFAULT, i; - if (!user_access_begin(uvec, nr_segs * sizeof(*uvec))) + if (!user_access_begin(uiov, nr_segs * sizeof(*uiov))) return -EFAULT; for (i = 0; i < nr_segs; i++) { From 797f0375dd2ef5cdc68ac23450cbae9a5c67a74e Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Mon, 11 Jan 2021 15:45:01 -0800 Subject: [PATCH 201/241] RISC-V: Do not allocate memblock while iterating reserved memblocks Currently, resource tree allocates memory blocks while iterating on the list. It leads to following kernel warning because memblock allocation also invokes memory block reservation API. [ 0.000000] ------------[ cut here ]------------ [ 0.000000] WARNING: CPU: 0 PID: 0 at kernel/resource.c:795 __insert_resource+0x8e/0xd0 [ 0.000000] Modules linked in: [ 0.000000] CPU: 0 PID: 0 Comm: swapper Not tainted 5.10.0-00022-ge20097fb37e2-dirty #549 [ 0.000000] epc: c00125c2 ra : c001262c sp : c1c01f50 [ 0.000000] gp : c1d456e0 tp : c1c0a980 t0 : ffffcf20 [ 0.000000] t1 : 00000000 t2 : 00000000 s0 : c1c01f60 [ 0.000000] s1 : ffffcf00 a0 : ffffff00 a1 : c1c0c0c4 [ 0.000000] a2 : 80c12b15 a3 : 80402000 a4 : 80402000 [ 0.000000] a5 : c1c0c0c4 a6 : 80c12b15 a7 : f5faf600 [ 0.000000] s2 : c1c0c0c4 s3 : c1c0e000 s4 : c1009a80 [ 0.000000] s5 : c1c0c000 s6 : c1d48000 s7 : c1613b4c [ 0.000000] s8 : 00000fff s9 : 80000200 s10: c1613b40 [ 0.000000] s11: 00000000 t3 : c1d4a000 t4 : ffffffff This is also unnecessary as we can pre-compute the total memblocks required for each memory region and allocate it before the loop. It save precious boot time not going through memblock allocation code every time. Fixes: 00ab027a3b82 ("RISC-V: Add kernel image sections to the resource tree") Reviewed-by: Anup Patel Tested-by: Geert Uytterhoeven Signed-off-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/setup.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/riscv/kernel/setup.c b/arch/riscv/kernel/setup.c index 1d85e9bf783c..3fa3f26dde85 100644 --- a/arch/riscv/kernel/setup.c +++ b/arch/riscv/kernel/setup.c @@ -127,7 +127,9 @@ static void __init init_resources(void) { struct memblock_region *region = NULL; struct resource *res = NULL; - int ret = 0; + struct resource *mem_res = NULL; + size_t mem_res_sz = 0; + int ret = 0, i = 0; code_res.start = __pa_symbol(_text); code_res.end = __pa_symbol(_etext) - 1; @@ -145,16 +147,17 @@ static void __init init_resources(void) bss_res.end = __pa_symbol(__bss_stop) - 1; bss_res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; + mem_res_sz = (memblock.memory.cnt + memblock.reserved.cnt) * sizeof(*mem_res); + mem_res = memblock_alloc(mem_res_sz, SMP_CACHE_BYTES); + if (!mem_res) + panic("%s: Failed to allocate %zu bytes\n", __func__, mem_res_sz); /* * Start by adding the reserved regions, if they overlap * with /memory regions, insert_resource later on will take * care of it. */ for_each_reserved_mem_region(region) { - res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct resource)); + res = &mem_res[i++]; res->name = "Reserved"; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; @@ -171,8 +174,10 @@ static void __init init_resources(void) * Ignore any other reserved regions within * system memory. */ - if (memblock_is_memory(res->start)) + if (memblock_is_memory(res->start)) { + memblock_free((phys_addr_t) res, sizeof(struct resource)); continue; + } ret = add_resource(&iomem_resource, res); if (ret < 0) @@ -181,10 +186,7 @@ static void __init init_resources(void) /* Add /memory regions to the resource tree */ for_each_mem_region(region) { - res = memblock_alloc(sizeof(struct resource), SMP_CACHE_BYTES); - if (!res) - panic("%s: Failed to allocate %zu bytes\n", __func__, - sizeof(struct resource)); + res = &mem_res[i++]; if (unlikely(memblock_is_nomap(region))) { res->name = "Reserved"; @@ -205,9 +207,9 @@ static void __init init_resources(void) return; error: - memblock_free((phys_addr_t) res, sizeof(struct resource)); /* Better an empty resource tree than an inconsistent one */ release_child_resources(&iomem_resource); + memblock_free((phys_addr_t) mem_res, mem_res_sz); } From abb8e86b269604e906a6a4af7a09f04b72dbb862 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Mon, 11 Jan 2021 15:45:02 -0800 Subject: [PATCH 202/241] RISC-V: Set current memblock limit Currently, linux kernel can not use last 4k bytes of addressable space because IS_ERR_VALUE macro treats those as an error. This will be an issue for RV32 as any memblock allocator potentially allocate chunk of memory from the end of DRAM (2GB) leading bad address error even though the address was technically valid. Fix this issue by limiting the memblock if available memory spans the entire address space. Reviewed-by: Anup Patel Signed-off-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index bf5379135e39..7cd4993f4ff2 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -157,9 +157,10 @@ disable: void __init setup_bootmem(void) { phys_addr_t mem_start = 0; - phys_addr_t start, end = 0; + phys_addr_t start, dram_end, end = 0; phys_addr_t vmlinux_end = __pa_symbol(&_end); phys_addr_t vmlinux_start = __pa_symbol(&_start); + phys_addr_t max_mapped_addr = __pa(~(ulong)0); u64 i; /* Find the memory region containing the kernel */ @@ -181,7 +182,18 @@ void __init setup_bootmem(void) /* Reserve from the start of the kernel to the end of the kernel */ memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); - max_pfn = PFN_DOWN(memblock_end_of_DRAM()); + dram_end = memblock_end_of_DRAM(); + + /* + * memblock allocator is not aware of the fact that last 4K bytes of + * the addressable memory can not be mapped because of IS_ERR_VALUE + * macro. Make sure that last 4k bytes are not usable by memblock + * if end of dram is equal to maximum addressable memory. + */ + if (max_mapped_addr == (dram_end - 1)) + memblock_set_current_limit(max_mapped_addr - 4096); + + max_pfn = PFN_DOWN(dram_end); max_low_pfn = max_pfn; dma32_phys_limit = min(4UL * SZ_1G, (unsigned long)PFN_PHYS(max_low_pfn)); set_max_mapnr(max_low_pfn); From e557793799c5a8406afb08aa170509619f7eac36 Mon Sep 17 00:00:00 2001 From: Atish Patra Date: Mon, 11 Jan 2021 15:45:04 -0800 Subject: [PATCH 203/241] RISC-V: Fix maximum allowed phsyical memory for RV32 Linux kernel can only map 1GB of address space for RV32 as the page offset is set to 0xC0000000. The current description in the Kconfig is confusing as it indicates that RV32 can support 2GB of physical memory. That is simply not true for current kernel. In future, a 2GB split support can be added to allow 2GB physical address space. Reviewed-by: Anup Patel Signed-off-by: Atish Patra Signed-off-by: Palmer Dabbelt --- arch/riscv/Kconfig | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 81b76d44725d..e9e2c1f0a690 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -137,7 +137,7 @@ config PA_BITS config PAGE_OFFSET hex - default 0xC0000000 if 32BIT && MAXPHYSMEM_2GB + default 0xC0000000 if 32BIT && MAXPHYSMEM_1GB default 0x80000000 if 64BIT && !MMU default 0xffffffff80000000 if 64BIT && MAXPHYSMEM_2GB default 0xffffffe000000000 if 64BIT && MAXPHYSMEM_128GB @@ -247,10 +247,12 @@ config MODULE_SECTIONS choice prompt "Maximum Physical Memory" - default MAXPHYSMEM_2GB if 32BIT + default MAXPHYSMEM_1GB if 32BIT default MAXPHYSMEM_2GB if 64BIT && CMODEL_MEDLOW default MAXPHYSMEM_128GB if 64BIT && CMODEL_MEDANY + config MAXPHYSMEM_1GB + bool "1GiB" config MAXPHYSMEM_2GB bool "2GiB" config MAXPHYSMEM_128GB From 29a951dfb3c3263c3a0f3bd9f7f2c2cfde4baedb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 8 Jan 2021 13:13:41 -0800 Subject: [PATCH 204/241] mm: fix clear_refs_write locking Turning page table entries read-only requires the mmap_sem held for writing. So stop doing the odd games with turning things from read locks to write locks and back. Just get the write lock. Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ee5a235b3056..ab7d700b2caa 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1215,41 +1215,26 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, .type = type, }; + if (mmap_write_lock_killable(mm)) { + count = -EINTR; + goto out_mm; + } if (type == CLEAR_REFS_MM_HIWATER_RSS) { - if (mmap_write_lock_killable(mm)) { - count = -EINTR; - goto out_mm; - } - /* * Writing 5 to /proc/pid/clear_refs resets the peak * resident set size to this mm's current rss value. */ reset_mm_hiwater_rss(mm); - mmap_write_unlock(mm); - goto out_mm; + goto out_unlock; } - if (mmap_read_lock_killable(mm)) { - count = -EINTR; - goto out_mm; - } tlb_gather_mmu(&tlb, mm, 0, -1); if (type == CLEAR_REFS_SOFT_DIRTY) { for (vma = mm->mmap; vma; vma = vma->vm_next) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; - mmap_read_unlock(mm); - if (mmap_write_lock_killable(mm)) { - count = -EINTR; - goto out_mm; - } - for (vma = mm->mmap; vma; vma = vma->vm_next) { - vma->vm_flags &= ~VM_SOFTDIRTY; - vma_set_page_prot(vma); - } - mmap_write_downgrade(mm); - break; + vma->vm_flags &= ~VM_SOFTDIRTY; + vma_set_page_prot(vma); } mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY, @@ -1261,7 +1246,8 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, if (type == CLEAR_REFS_SOFT_DIRTY) mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb, 0, -1); - mmap_read_unlock(mm); +out_unlock: + mmap_write_unlock(mm); out_mm: mmput(mm); } From 9348b73c2e1bfea74ccd4a44fb4ccc7276ab9623 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 9 Jan 2021 17:09:10 -0800 Subject: [PATCH 205/241] mm: don't play games with pinned pages in clear_page_refs Turning a pinned page read-only breaks the pinning after COW. Don't do it. The whole "track page soft dirty" state doesn't work with pinned pages anyway, since the page might be dirtied by the pinning entity without ever being noticed in the page tables. Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ab7d700b2caa..602e3a52884d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1035,6 +1035,25 @@ struct clear_refs_private { }; #ifdef CONFIG_MEM_SOFT_DIRTY + +#define is_cow_mapping(flags) (((flags) & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) + +static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte) +{ + struct page *page; + + if (!pte_write(pte)) + return false; + if (!is_cow_mapping(vma->vm_flags)) + return false; + if (likely(!atomic_read(&vma->vm_mm->has_pinned))) + return false; + page = vm_normal_page(vma, addr, pte); + if (!page) + return false; + return page_maybe_dma_pinned(page); +} + static inline void clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { @@ -1049,6 +1068,8 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, if (pte_present(ptent)) { pte_t old_pte; + if (pte_is_pinned(vma, addr, ptent)) + return; old_pte = ptep_modify_prot_start(vma, addr, pte); ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); From d36a1dd9f77ae1e72da48f4123ed35627848507d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 5 Jan 2021 14:43:46 -0500 Subject: [PATCH 206/241] dump_common_audit_data(): fix racy accesses to ->d_name We are not guaranteed the locking environment that would prevent dentry getting renamed right under us. And it's possible for old long name to be freed after rename, leading to UAF here. Cc: stable@kernel.org # v2.6.2+ Signed-off-by: Al Viro --- security/lsm_audit.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/security/lsm_audit.c b/security/lsm_audit.c index 7d8026f3f377..a0cd28cd31a8 100644 --- a/security/lsm_audit.c +++ b/security/lsm_audit.c @@ -275,7 +275,9 @@ static void dump_common_audit_data(struct audit_buffer *ab, struct inode *inode; audit_log_format(ab, " name="); + spin_lock(&a->u.dentry->d_lock); audit_log_untrustedstring(ab, a->u.dentry->d_name.name); + spin_unlock(&a->u.dentry->d_lock); inode = d_backing_inode(a->u.dentry); if (inode) { @@ -293,8 +295,9 @@ static void dump_common_audit_data(struct audit_buffer *ab, dentry = d_find_alias(inode); if (dentry) { audit_log_format(ab, " name="); - audit_log_untrustedstring(ab, - dentry->d_name.name); + spin_lock(&dentry->d_lock); + audit_log_untrustedstring(ab, dentry->d_name.name); + spin_unlock(&dentry->d_lock); dput(dentry); } audit_log_format(ab, " dev="); From 66c556025d687dbdd0f748c5e1df89c977b6c02a Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 15 Jan 2021 15:04:40 +0000 Subject: [PATCH 207/241] skbuff: back tiny skbs with kmalloc() in __netdev_alloc_skb() too Commit 3226b158e67c ("net: avoid 32 x truesize under-estimation for tiny skbs") ensured that skbs with data size lower than 1025 bytes will be kmalloc'ed to avoid excessive page cache fragmentation and memory consumption. However, the fix adressed only __napi_alloc_skb() (primarily for virtio_net and napi_get_frags()), but the issue can still be achieved through __netdev_alloc_skb(), which is still used by several drivers. Drivers often allocate a tiny skb for headers and place the rest of the frame to frags (so-called copybreak). Mirror the condition to __netdev_alloc_skb() to handle this case too. Since v1 [0]: - fix "Fixes:" tag; - refine commit message (mention copybreak usecase). [0] https://lore.kernel.org/netdev/20210114235423.232737-1-alobakin@pm.me Fixes: a1c7fff7e18f ("net: netdev_alloc_skb() use build_skb()") Signed-off-by: Alexander Lobakin Link: https://lore.kernel.org/r/20210115150354.85967-1-alobakin@pm.me Signed-off-by: Jakub Kicinski --- net/core/skbuff.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c1a6f262636a..785daff48030 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -437,7 +437,11 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, len += NET_SKB_PAD; - if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) || + /* If requested length is either too small or too big, + * we use kmalloc() for skb->head allocation. + */ + if (len <= SKB_WITH_OVERHEAD(1024) || + len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE); if (!skb) From 32c2bc8f2d855d4415c9a05b727e34649397bfbe Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Fri, 18 Dec 2020 18:35:50 +0200 Subject: [PATCH 208/241] ia64: fix build failure caused by memory model changes The change of ia64's default memory model to SPARSEMEM causes defconfig build to fail: CC kernel/async.o In file included from include/linux/numa.h:25, from include/linux/async.h:13, from kernel/async.c:47: arch/ia64/include/asm/sparsemem.h:14:40: warning: "PAGE_SHIFT" is not defined, evaluates to 0 [-Wundef] 14 | #if ((CONFIG_FORCE_MAX_ZONEORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS) | ^~~~~~~~~~ In file included from include/linux/gfp.h:6, from include/linux/xarray.h:14, from include/linux/radix-tree.h:19, from include/linux/idr.h:15, from include/linux/kernfs.h:13, from include/linux/sysfs.h:16, from include/linux/kobject.h:20, from include/linux/energy_model.h:7, from include/linux/device.h:16, from include/linux/async.h:14, from kernel/async.c:47: include/linux/mmzone.h:1156:2: error: #error Allocator MAX_ORDER exceeds SECTION_SIZE 1156 | #error Allocator MAX_ORDER exceeds SECTION_SIZE | ^~~~~ The error cause is the missing definition of PAGE_SHIFT in the calculation of SECTION_SIZE_BITS. Add include of to arch/ia64/include/asm/sparsemem.h to solve the problem. Fixes: 214496cb1870 ("ia64: make SPARSEMEM default and disable DISCONTIGMEM") Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Mike Rapoport --- arch/ia64/include/asm/sparsemem.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/ia64/include/asm/sparsemem.h b/arch/ia64/include/asm/sparsemem.h index dd8c166ffd7b..42ed5248fae9 100644 --- a/arch/ia64/include/asm/sparsemem.h +++ b/arch/ia64/include/asm/sparsemem.h @@ -3,6 +3,7 @@ #define _ASM_IA64_SPARSEMEM_H #ifdef CONFIG_SPARSEMEM +#include /* * SECTION_SIZE_BITS 2^N: how big each section will be * MAX_PHYSMEM_BITS 2^N: how much memory we can have in that space From fff7b5e6ee63c5d20406a131b260c619cdd24fd1 Mon Sep 17 00:00:00 2001 From: Dexuan Cui Date: Sat, 16 Jan 2021 14:31:36 -0800 Subject: [PATCH 209/241] x86/hyperv: Initialize clockevents after LAPIC is initialized With commit 4df4cb9e99f8, the Hyper-V direct-mode STIMER is actually initialized before LAPIC is initialized: see apic_intr_mode_init() x86_platform.apic_post_init() hyperv_init() hv_stimer_alloc() apic_bsp_setup() setup_local_APIC() setup_local_APIC() temporarily disables LAPIC, initializes it and re-eanble it. The direct-mode STIMER depends on LAPIC, and when it's registered, it can be programmed immediately and the timer can fire very soon: hv_stimer_init clockevents_config_and_register clockevents_register_device tick_check_new_device tick_setup_device tick_setup_periodic(), tick_setup_oneshot() clockevents_program_event When the timer fires in the hypervisor, if the LAPIC is in the disabled state, new versions of Hyper-V ignore the event and don't inject the timer interrupt into the VM, and hence the VM hangs when it boots. Note: when the VM starts/reboots, the LAPIC is pre-enabled by the firmware, so the window of LAPIC being temporarily disabled is pretty small, and the issue can only happen once out of 100~200 reboots for a 40-vCPU VM on one dev host, and on another host the issue doesn't reproduce after 2000 reboots. The issue is more noticeable for kdump/kexec, because the LAPIC is disabled by the first kernel, and stays disabled until the kdump/kexec kernel enables it. This is especially an issue to a Generation-2 VM (for which Hyper-V doesn't emulate the PIT timer) when CONFIG_HZ=1000 (rather than CONFIG_HZ=250) is used. Fix the issue by moving hv_stimer_alloc() to a later place where the LAPIC timer is initialized. Fixes: 4df4cb9e99f8 ("x86/hyperv: Initialize clockevents earlier in CPU onlining") Signed-off-by: Dexuan Cui Reviewed-by: Michael Kelley Link: https://lore.kernel.org/r/20210116223136.13892-1-decui@microsoft.com Signed-off-by: Wei Liu --- arch/x86/hyperv/hv_init.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/arch/x86/hyperv/hv_init.c b/arch/x86/hyperv/hv_init.c index 4638a52d8eae..6375967a8244 100644 --- a/arch/x86/hyperv/hv_init.c +++ b/arch/x86/hyperv/hv_init.c @@ -315,6 +315,25 @@ static struct syscore_ops hv_syscore_ops = { .resume = hv_resume, }; +static void (* __initdata old_setup_percpu_clockev)(void); + +static void __init hv_stimer_setup_percpu_clockev(void) +{ + /* + * Ignore any errors in setting up stimer clockevents + * as we can run with the LAPIC timer as a fallback. + */ + (void)hv_stimer_alloc(); + + /* + * Still register the LAPIC timer, because the direct-mode STIMER is + * not supported by old versions of Hyper-V. This also allows users + * to switch to LAPIC timer via /sys, if they want to. + */ + if (old_setup_percpu_clockev) + old_setup_percpu_clockev(); +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -393,10 +412,14 @@ void __init hyperv_init(void) wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); /* - * Ignore any errors in setting up stimer clockevents - * as we can run with the LAPIC timer as a fallback. + * hyperv_init() is called before LAPIC is initialized: see + * apic_intr_mode_init() -> x86_platform.apic_post_init() and + * apic_bsp_setup() -> setup_local_APIC(). The direct-mode STIMER + * depends on LAPIC, so hv_stimer_alloc() should be called from + * x86_init.timers.setup_percpu_clockev. */ - (void)hv_stimer_alloc(); + old_setup_percpu_clockev = x86_init.timers.setup_percpu_clockev; + x86_init.timers.setup_percpu_clockev = hv_stimer_setup_percpu_clockev; hv_apic_init(); From feb889fb40fafc6933339cf1cca8f770126819fb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 16 Jan 2021 15:34:57 -0800 Subject: [PATCH 210/241] mm: don't put pinned pages into the swap cache So technically there is nothing wrong with adding a pinned page to the swap cache, but the pinning obviously means that the page can't actually be free'd right now anyway, so it's a bit pointless. However, the real problem is not with it being a bit pointless: the real issue is that after we've added it to the swap cache, we'll try to unmap the page. That will succeed, because the code in mm/rmap.c doesn't know or care about pinned pages. Even the unmapping isn't fatal per se, since the page will stay around in memory due to the pinning, and we do hold the connection to it using the swap cache. But when we then touch it next and take a page fault, the logic in do_swap_page() will map it back into the process as a possibly read-only page, and we'll then break the page association on the next COW fault. Honestly, this issue could have been fixed in any of those other places: (a) we could refuse to unmap a pinned page (which makes conceptual sense), or (b) we could make sure to re-map a pinned page writably in do_swap_page(), or (c) we could just make do_wp_page() not COW the pinned page (which was what we historically did before that "mm: do_wp_page() simplification" commit). But while all of them are equally valid models for breaking this chain, not putting pinned pages into the swap cache in the first place is the simplest one by far. It's also the safest one: the reason why do_wp_page() was changed in the first place was that getting the "can I re-use this page" wrong is so fraught with errors. If you do it wrong, you end up with an incorrectly shared page. As a result, using "page_maybe_dma_pinned()" in either do_wp_page() or do_swap_page() would be a serious bug since it is only a (very good) heuristic. Re-using the page requires a hard black-and-white rule with no room for ambiguity. In contrast, saying "this page is very likely dma pinned, so let's not add it to the swap cache and try to unmap it" is an obviously safe thing to do, and if the heuristic might very rarely be a false positive, no harm is done. Fixes: 09854ba94c6a ("mm: do_wp_page() simplification") Reported-and-tested-by: Martin Raiber Cc: Pavel Begunkov Cc: Jens Axboe Cc: Peter Xu Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index 257cba79a96d..b1b574ad199d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1238,6 +1238,8 @@ static unsigned int shrink_page_list(struct list_head *page_list, if (!PageSwapCache(page)) { if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; + if (page_maybe_dma_pinned(page)) + goto keep_locked; if (PageTransHuge(page)) { /* cannot split THP, skip it */ if (!can_split_huge_page(page, NULL)) From 19c329f6808995b142b3966301f217c831e7cf31 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 17 Jan 2021 16:37:05 -0800 Subject: [PATCH 211/241] Linux 5.11-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9e73f82e0d86..b0e4767735dc 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 11 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Kleptomaniac Octopus # *DOCUMENTATION* From bd9dcef67ffcae2de49e319fba349df76472fd10 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 15 Jan 2021 11:11:23 -0800 Subject: [PATCH 212/241] x86/xen: fix 'nopvspin' build error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix build error in x86/xen/ when PARAVIRT_SPINLOCKS is not enabled. Fixes this build error: ../arch/x86/xen/smp_hvm.c: In function ‘xen_hvm_smp_init’: ../arch/x86/xen/smp_hvm.c:77:3: error: ‘nopvspin’ undeclared (first use in this function) nopvspin = true; Fixes: 3d7746bea925 ("x86/xen: Fix xen_hvm_smp_init() when vector callback not available") Signed-off-by: Randy Dunlap Reviewed-by: Juergen Gross Cc: David Woodhouse Cc: Juergen Gross Link: https://lore.kernel.org/r/20210115191123.27572-1-rdunlap@infradead.org Signed-off-by: Juergen Gross --- arch/x86/xen/smp_hvm.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/xen/smp_hvm.c b/arch/x86/xen/smp_hvm.c index 056430a1080b..6ff3c887e0b9 100644 --- a/arch/x86/xen/smp_hvm.c +++ b/arch/x86/xen/smp_hvm.c @@ -74,7 +74,9 @@ void __init xen_hvm_smp_init(void) smp_ops.cpu_die = xen_hvm_cpu_die; if (!xen_have_vector_callback) { +#ifdef CONFIG_PARAVIRT_SPINLOCKS nopvspin = true; +#endif return; } From 79267ae22615496655feee2db0848f6786bcf67a Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 18 Jan 2021 15:52:10 +0200 Subject: [PATCH 213/241] net: mscc: ocelot: allow offloading of bridge on top of LAG The blamed commit was too aggressive, and it made ocelot_netdevice_event react only to network interface events emitted for the ocelot switch ports. In fact, only the PRECHANGEUPPER should have had that check. When we ignore all events that are not for us, we miss the fact that the upper of the LAG changes, and the bonding interface gets enslaved to a bridge. This is an operation we could offload under certain conditions. Fixes: 7afb3e575e5a ("net: mscc: ocelot: don't handle netdev events for other netdevs") Signed-off-by: Vladimir Oltean Reviewed-by: Alexandre Belloni Link: https://lore.kernel.org/r/20210118135210.2666246-1-olteanv@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot_net.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot_net.c b/drivers/net/ethernet/mscc/ocelot_net.c index 2bd2840d88bd..42230f92ca9c 100644 --- a/drivers/net/ethernet/mscc/ocelot_net.c +++ b/drivers/net/ethernet/mscc/ocelot_net.c @@ -1042,10 +1042,8 @@ static int ocelot_netdevice_event(struct notifier_block *unused, struct net_device *dev = netdev_notifier_info_to_dev(ptr); int ret = 0; - if (!ocelot_netdevice_dev_check(dev)) - return 0; - if (event == NETDEV_PRECHANGEUPPER && + ocelot_netdevice_dev_check(dev) && netif_is_lag_master(info->upper_dev)) { struct netdev_lag_upper_info *lag_upper_info = info->upper_info; struct netlink_ext_ack *extack; From 87fe04367d842c4d97a77303242d4dd4ac351e46 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Sat, 16 Jan 2021 03:39:35 +0100 Subject: [PATCH 214/241] net: dsa: mv88e6xxx: also read STU state in mv88e6250_g1_vtu_getnext mv88e6xxx_port_vlan_join checks whether the VTU already contains an entry for the given vid (via mv88e6xxx_vtu_getnext), and if so, merely changes the relevant .member[] element and loads the updated entry into the VTU. However, at least for the mv88e6250, the on-stack struct mv88e6xxx_vtu_entry vlan never has its .state[] array explicitly initialized, neither in mv88e6xxx_port_vlan_join() nor inside the getnext implementation. So the new entry has random garbage for the STU bits, breaking VLAN filtering. When the VTU entry is initially created, those bits are all zero, and we should make sure to keep them that way when the entry is updated. Fixes: 92307069a96c (net: dsa: mv88e6xxx: Avoid VTU corruption on 6097) Signed-off-by: Rasmus Villemoes Reviewed-by: Florian Fainelli Reviewed-by: Tobias Waldekranz Tested-by: Tobias Waldekranz Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/global1_vtu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/dsa/mv88e6xxx/global1_vtu.c b/drivers/net/dsa/mv88e6xxx/global1_vtu.c index 66ddf67b8737..7b96396be609 100644 --- a/drivers/net/dsa/mv88e6xxx/global1_vtu.c +++ b/drivers/net/dsa/mv88e6xxx/global1_vtu.c @@ -351,6 +351,10 @@ int mv88e6250_g1_vtu_getnext(struct mv88e6xxx_chip *chip, if (err) return err; + err = mv88e6185_g1_stu_data_read(chip, entry); + if (err) + return err; + /* VTU DBNum[3:0] are located in VTU Operation 3:0 * VTU DBNum[5:4] are located in VTU Operation 9:8 */ From a826b04303a40d52439aa141035fca5654ccaccd Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 15 Jan 2021 19:42:08 +0100 Subject: [PATCH 215/241] ipv6: create multicast route with RTPROT_KERNEL The ff00::/8 multicast route is created without specifying the fc_protocol field, so the default RTPROT_BOOT value is used: $ ip -6 -d route unicast ::1 dev lo proto kernel scope global metric 256 pref medium unicast fe80::/64 dev eth0 proto kernel scope global metric 256 pref medium unicast ff00::/8 dev eth0 proto boot scope global metric 256 pref medium As the documentation says, this value identifies routes installed during boot, but the route is created when interface is set up. Change the value to RTPROT_KERNEL which is a better value. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Matteo Croce Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index eff2cacd5209..19bf6822911c 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2469,6 +2469,7 @@ static void addrconf_add_mroute(struct net_device *dev) .fc_flags = RTF_UP, .fc_type = RTN_UNICAST, .fc_nlinfo.nl_net = dev_net(dev), + .fc_protocol = RTPROT_KERNEL, }; ipv6_addr_set(&cfg.fc_dst, htonl(0xFF000000), 0, 0, 0); From ceed9038b2783d14e0422bdc6fd04f70580efb4c Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Fri, 15 Jan 2021 19:42:09 +0100 Subject: [PATCH 216/241] ipv6: set multicast flag on the multicast route The multicast route ff00::/8 is created with type RTN_UNICAST: $ ip -6 -d route unicast ::1 dev lo proto kernel scope global metric 256 pref medium unicast fe80::/64 dev eth0 proto kernel scope global metric 256 pref medium unicast ff00::/8 dev eth0 proto kernel scope global metric 256 pref medium Set the type to RTN_MULTICAST which is more appropriate. Fixes: e8478e80e5a7 ("net/ipv6: Save route type in rt6_info") Signed-off-by: Matteo Croce Signed-off-by: Jakub Kicinski --- net/ipv6/addrconf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 19bf6822911c..9edc5bb2d531 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2467,7 +2467,7 @@ static void addrconf_add_mroute(struct net_device *dev) .fc_ifindex = dev->ifindex, .fc_dst_len = 8, .fc_flags = RTF_UP, - .fc_type = RTN_UNICAST, + .fc_type = RTN_MULTICAST, .fc_nlinfo.nl_net = dev_net(dev), .fc_protocol = RTPROT_KERNEL, }; From 9d9b1ee0b2d1c9e02b2338c4a4b0a062d2d3edac Mon Sep 17 00:00:00 2001 From: Enke Chen Date: Fri, 15 Jan 2021 14:30:58 -0800 Subject: [PATCH 217/241] tcp: fix TCP_USER_TIMEOUT with zero window The TCP session does not terminate with TCP_USER_TIMEOUT when data remain untransmitted due to zero window. The number of unanswered zero-window probes (tcp_probes_out) is reset to zero with incoming acks irrespective of the window size, as described in tcp_probe_timer(): RFC 1122 4.2.2.17 requires the sender to stay open indefinitely as long as the receiver continues to respond probes. We support this by default and reset icsk_probes_out with incoming ACKs. This counter, however, is the wrong one to be used in calculating the duration that the window remains closed and data remain untransmitted. Thanks to Jonathan Maxwell for diagnosing the actual issue. In this patch a new timestamp is introduced for the socket in order to track the elapsed time for the zero-window probes that have not been answered with any non-zero window ack. Fixes: 9721e709fa68 ("tcp: simplify window probe aborting on USER_TIMEOUT") Reported-by: William McCall Co-developed-by: Neal Cardwell Signed-off-by: Neal Cardwell Signed-off-by: Enke Chen Reviewed-by: Yuchung Cheng Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20210115223058.GA39267@localhost.localdomain Signed-off-by: Jakub Kicinski --- include/net/inet_connection_sock.h | 3 +++ net/ipv4/inet_connection_sock.c | 1 + net/ipv4/tcp.c | 1 + net/ipv4/tcp_input.c | 1 + net/ipv4/tcp_output.c | 1 + net/ipv4/tcp_timer.c | 14 +++++++------- 6 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 7338b3865a2a..111d7771b208 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -76,6 +76,8 @@ struct inet_connection_sock_af_ops { * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options) * @icsk_ack: Delayed ACK control data * @icsk_mtup; MTU probing control data + * @icsk_probes_tstamp: Probe timestamp (cleared by non-zero window ack) + * @icsk_user_timeout: TCP_USER_TIMEOUT value */ struct inet_connection_sock { /* inet_sock has to be the first member! */ @@ -129,6 +131,7 @@ struct inet_connection_sock { u32 probe_timestamp; } icsk_mtup; + u32 icsk_probes_tstamp; u32 icsk_user_timeout; u64 icsk_ca_priv[104 / sizeof(u64)]; diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index fd8b8800a2c3..6bd7ca09af03 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -851,6 +851,7 @@ struct sock *inet_csk_clone_lock(const struct sock *sk, newicsk->icsk_retransmits = 0; newicsk->icsk_backoff = 0; newicsk->icsk_probes_out = 0; + newicsk->icsk_probes_tstamp = 0; /* Deinitialize accept_queue to trap illegal accesses. */ memset(&newicsk->icsk_accept_queue, 0, sizeof(newicsk->icsk_accept_queue)); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ed42d2193c5c..32545ecf2ab1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2937,6 +2937,7 @@ int tcp_disconnect(struct sock *sk, int flags) icsk->icsk_backoff = 0; icsk->icsk_probes_out = 0; + icsk->icsk_probes_tstamp = 0; icsk->icsk_rto = TCP_TIMEOUT_INIT; icsk->icsk_rto_min = TCP_RTO_MIN; icsk->icsk_delack_max = TCP_DELACK_MAX; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c7e16b0ed791..bafcab75f425 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3384,6 +3384,7 @@ static void tcp_ack_probe(struct sock *sk) return; if (!after(TCP_SKB_CB(head)->end_seq, tcp_wnd_end(tp))) { icsk->icsk_backoff = 0; + icsk->icsk_probes_tstamp = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); /* Socket must be waked up by subsequent tcp_data_snd_check(). * This function is not for random using! diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f322e798a351..ab458697881e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -4084,6 +4084,7 @@ void tcp_send_probe0(struct sock *sk) /* Cancel probe timer, if it is not required. */ icsk->icsk_probes_out = 0; icsk->icsk_backoff = 0; + icsk->icsk_probes_tstamp = 0; return; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 6c62b9ea1320..454732ecc8f3 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -349,6 +349,7 @@ static void tcp_probe_timer(struct sock *sk) if (tp->packets_out || !skb) { icsk->icsk_probes_out = 0; + icsk->icsk_probes_tstamp = 0; return; } @@ -360,13 +361,12 @@ static void tcp_probe_timer(struct sock *sk) * corresponding system limit. We also implement similar policy when * we use RTO to probe window in tcp_retransmit_timer(). */ - if (icsk->icsk_user_timeout) { - u32 elapsed = tcp_model_timeout(sk, icsk->icsk_probes_out, - tcp_probe0_base(sk)); - - if (elapsed >= icsk->icsk_user_timeout) - goto abort; - } + if (!icsk->icsk_probes_tstamp) + icsk->icsk_probes_tstamp = tcp_jiffies32; + else if (icsk->icsk_user_timeout && + (s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >= + msecs_to_jiffies(icsk->icsk_user_timeout)) + goto abort; max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2; if (sock_flag(sk, SOCK_DEAD)) { From 7e238de8283acd32c26c2bc2a50672d0ea862ff7 Mon Sep 17 00:00:00 2001 From: Oleksandr Mazur Date: Tue, 19 Jan 2021 10:53:33 +0200 Subject: [PATCH 218/241] net: core: devlink: use right genl user_ptr when handling port param get/set Fix incorrect user_ptr dereferencing when handling port param get/set: idx [0] stores the 'struct devlink' pointer; idx [1] stores the 'struct devlink_port' pointer; Fixes: 637989b5d77e ("devlink: Always use user_ptr[0] for devlink and simplify post_doit") CC: Parav Pandit Signed-off-by: Oleksandr Mazur Signed-off-by: Vadym Kochan Link: https://lore.kernel.org/r/20210119085333.16833-1-vadym.kochan@plvision.eu Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/devlink.c b/net/core/devlink.c index ee828e4b1007..738d4344d679 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -4146,7 +4146,7 @@ out: static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; struct devlink_param_item *param_item; struct sk_buff *msg; int err; @@ -4175,7 +4175,7 @@ static int devlink_nl_cmd_port_param_get_doit(struct sk_buff *skb, static int devlink_nl_cmd_port_param_set_doit(struct sk_buff *skb, struct genl_info *info) { - struct devlink_port *devlink_port = info->user_ptr[0]; + struct devlink_port *devlink_port = info->user_ptr[1]; return __devlink_nl_cmd_param_set_doit(devlink_port->devlink, devlink_port->index, From 8eed01b5ca9c1deff329ad44f08e2041ca14842c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 18 Jan 2021 16:06:55 +0100 Subject: [PATCH 219/241] mdio-bitbang: Export mdiobb_{read,write}() Export mdiobb_read() and mdiobb_write(), so Ethernet controller drivers can call them from their MDIO read/write wrappers. Signed-off-by: Geert Uytterhoeven Tested-by: Wolfram Sang Reviewed-by: Florian Fainelli Signed-off-by: Jakub Kicinski --- drivers/net/mdio/mdio-bitbang.c | 6 ++++-- include/linux/mdio-bitbang.h | 3 +++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/net/mdio/mdio-bitbang.c b/drivers/net/mdio/mdio-bitbang.c index 5136275c8e73..d3915f831854 100644 --- a/drivers/net/mdio/mdio-bitbang.c +++ b/drivers/net/mdio/mdio-bitbang.c @@ -149,7 +149,7 @@ static int mdiobb_cmd_addr(struct mdiobb_ctrl *ctrl, int phy, u32 addr) return dev_addr; } -static int mdiobb_read(struct mii_bus *bus, int phy, int reg) +int mdiobb_read(struct mii_bus *bus, int phy, int reg) { struct mdiobb_ctrl *ctrl = bus->priv; int ret, i; @@ -180,8 +180,9 @@ static int mdiobb_read(struct mii_bus *bus, int phy, int reg) mdiobb_get_bit(ctrl); return ret; } +EXPORT_SYMBOL(mdiobb_read); -static int mdiobb_write(struct mii_bus *bus, int phy, int reg, u16 val) +int mdiobb_write(struct mii_bus *bus, int phy, int reg, u16 val) { struct mdiobb_ctrl *ctrl = bus->priv; @@ -201,6 +202,7 @@ static int mdiobb_write(struct mii_bus *bus, int phy, int reg, u16 val) mdiobb_get_bit(ctrl); return 0; } +EXPORT_SYMBOL(mdiobb_write); struct mii_bus *alloc_mdio_bitbang(struct mdiobb_ctrl *ctrl) { diff --git a/include/linux/mdio-bitbang.h b/include/linux/mdio-bitbang.h index 5d71e8a8500f..aca4dc037b70 100644 --- a/include/linux/mdio-bitbang.h +++ b/include/linux/mdio-bitbang.h @@ -35,6 +35,9 @@ struct mdiobb_ctrl { const struct mdiobb_ops *ops; }; +int mdiobb_read(struct mii_bus *bus, int phy, int reg); +int mdiobb_write(struct mii_bus *bus, int phy, int reg, u16 val); + /* The returned bus is not yet registered with the phy layer. */ struct mii_bus *alloc_mdio_bitbang(struct mdiobb_ctrl *ctrl); From 02cae02a7de1484095e4ba984bfee7a75843ec26 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 18 Jan 2021 16:06:56 +0100 Subject: [PATCH 220/241] sh_eth: Make PHY access aware of Runtime PM to fix reboot crash Wolfram reports that his R-Car H2-based Lager board can no longer be rebooted in v5.11-rc1, as it crashes with an imprecise external abort. The issue can be reproduced on other boards (e.g. Koelsch with R-Car M2-W) too, if CONFIG_IP_PNP is disabled, and the Ethernet interface is down at reboot time: Unhandled fault: imprecise external abort (0x1406) at 0x00000000 pgd = (ptrval) [00000000] *pgd=422b6835, *pte=00000000, *ppte=00000000 Internal error: : 1406 [#1] ARM Modules linked in: CPU: 0 PID: 1105 Comm: init Tainted: G W 5.10.0-rc1-00402-ge2f016cf7751 #1048 Hardware name: Generic R-Car Gen2 (Flattened Device Tree) PC is at sh_mdio_ctrl+0x44/0x60 LR is at sh_mmd_ctrl+0x20/0x24 ... Backtrace: [] (sh_mdio_ctrl) from [] (sh_mmd_ctrl+0x20/0x24) r7:0000001f r6:00000020 r5:00000002 r4:c22a1dc4 [] (sh_mmd_ctrl) from [] (mdiobb_cmd+0x38/0xa8) [] (mdiobb_cmd) from [] (mdiobb_read+0x58/0xdc) r9:c229f844 r8:c0c329dc r7:c221e000 r6:00000001 r5:c22a1dc4 r4:00000001 [] (mdiobb_read) from [] (__mdiobus_read+0x74/0xe0) r7:0000001f r6:00000001 r5:c221e000 r4:c221e000 [] (__mdiobus_read) from [] (mdiobus_read+0x40/0x54) r7:0000001f r6:00000001 r5:c221e000 r4:c221e458 [] (mdiobus_read) from [] (phy_read+0x1c/0x20) r7:ffffe000 r6:c221e470 r5:00000200 r4:c229f800 [] (phy_read) from [] (kszphy_config_intr+0x44/0x80) [] (kszphy_config_intr) from [] (phy_disable_interrupts+0x44/0x50) r5:c229f800 r4:c229f800 [] (phy_disable_interrupts) from [] (phy_shutdown+0x18/0x1c) r5:c229f800 r4:c229f804 [] (phy_shutdown) from [] (device_shutdown+0x168/0x1f8) [] (device_shutdown) from [] (kernel_restart_prepare+0x3c/0x48) r9:c22d2000 r8:c0100264 r7:c0b0d034 r6:00000000 r5:4321fedc r4:00000000 [] (kernel_restart_prepare) from [] (kernel_restart+0x1c/0x60) [] (kernel_restart) from [] (__do_sys_reboot+0x168/0x208) r5:4321fedc r4:01234567 [] (__do_sys_reboot) from [] (sys_reboot+0x18/0x1c) r7:00000058 r6:00000000 r5:00000000 r4:00000000 [] (sys_reboot) from [] (ret_fast_syscall+0x0/0x54) As of commit e2f016cf775129c0 ("net: phy: add a shutdown procedure"), system reboot calls phy_disable_interrupts() during shutdown. As this happens unconditionally, the PHY registers may be accessed while the device is suspended, causing undefined behavior, which may crash the system. Fix this by wrapping the PHY bitbang accessors in the sh_eth driver by wrappers that take care of Runtime PM, to resume the device when needed. Reported-by: Wolfram Sang Suggested-by: Andrew Lunn Signed-off-by: Geert Uytterhoeven Tested-by: Wolfram Sang Reviewed-by: Florian Fainelli Reviewed-by: Andrew Lunn Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/sh_eth.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index c63304632935..9b52d350e21a 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -3034,6 +3034,28 @@ static int sh_mdio_release(struct sh_eth_private *mdp) return 0; } +static int sh_mdiobb_read(struct mii_bus *bus, int phy, int reg) +{ + int res; + + pm_runtime_get_sync(bus->parent); + res = mdiobb_read(bus, phy, reg); + pm_runtime_put(bus->parent); + + return res; +} + +static int sh_mdiobb_write(struct mii_bus *bus, int phy, int reg, u16 val) +{ + int res; + + pm_runtime_get_sync(bus->parent); + res = mdiobb_write(bus, phy, reg, val); + pm_runtime_put(bus->parent); + + return res; +} + /* MDIO bus init function */ static int sh_mdio_init(struct sh_eth_private *mdp, struct sh_eth_plat_data *pd) @@ -3058,6 +3080,10 @@ static int sh_mdio_init(struct sh_eth_private *mdp, if (!mdp->mii_bus) return -ENOMEM; + /* Wrap accessors with Runtime PM-aware ops */ + mdp->mii_bus->read = sh_mdiobb_read; + mdp->mii_bus->write = sh_mdiobb_write; + /* Hook up MII support for ethtool */ mdp->mii_bus->name = "sh_mii"; mdp->mii_bus->parent = dev; From 301a33d51880619d0c5a581b5a48d3a5248fa84b Mon Sep 17 00:00:00 2001 From: Mircea Cirjaliu Date: Tue, 19 Jan 2021 21:53:18 +0100 Subject: [PATCH 221/241] bpf: Fix helper bpf_map_peek_elem_proto pointing to wrong callback I assume this was obtained by copy/paste. Point it to bpf_map_peek_elem() instead of bpf_map_pop_elem(). In practice it may have been less likely hit when under JIT given shielded via 84430d4232c3 ("bpf, verifier: avoid retpoline for map push/pop/peek operation"). Fixes: f1a2e44a3aec ("bpf: add queue and stack maps") Signed-off-by: Mircea Cirjaliu Signed-off-by: Daniel Borkmann Cc: Mauricio Vasquez Link: https://lore.kernel.org/bpf/AM7PR02MB6082663DFDCCE8DA7A6DD6B1BBA30@AM7PR02MB6082.eurprd02.prod.outlook.com --- kernel/bpf/helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index bd8a3183d030..41ca280b1dc1 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -108,7 +108,7 @@ BPF_CALL_2(bpf_map_peek_elem, struct bpf_map *, map, void *, value) } const struct bpf_func_proto bpf_map_peek_elem_proto = { - .func = bpf_map_pop_elem, + .func = bpf_map_peek_elem, .gpl_only = false, .ret_type = RET_INTEGER, .arg1_type = ARG_CONST_MAP_PTR, From b425e24a934e21a502d25089c6c7443d799c5594 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Mon, 18 Jan 2021 18:03:33 +0200 Subject: [PATCH 222/241] xsk: Clear pool even for inactive queues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The number of queues can change by other means, rather than ethtool. For example, attaching an mqprio qdisc with num_tc > 1 leads to creating multiple sets of TX queues, which may be then destroyed when mqprio is deleted. If an AF_XDP socket is created while mqprio is active, dev->_tx[queue_id].pool will be filled, but then real_num_tx_queues may decrease with deletion of mqprio, which will mean that the pool won't be NULLed, and a further increase of the number of TX queues may expose a dangling pointer. To avoid any potential misbehavior, this commit clears pool for RX and TX queues, regardless of real_num_*_queues, still taking into consideration num_*_queues to avoid overflows. Fixes: 1c1efc2af158 ("xsk: Create and free buffer pool independently from umem") Fixes: a41b4f3c58dd ("xsk: simplify xdp_clear_umem_at_qid implementation") Signed-off-by: Maxim Mikityanskiy Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/20210118160333.333439-1-maximmi@mellanox.com --- net/xdp/xsk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 8037b04a9edd..4a83117507f5 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -108,9 +108,9 @@ EXPORT_SYMBOL(xsk_get_pool_from_qid); void xsk_clear_pool_at_qid(struct net_device *dev, u16 queue_id) { - if (queue_id < dev->real_num_rx_queues) + if (queue_id < dev->num_rx_queues) dev->_rx[queue_id].pool = NULL; - if (queue_id < dev->real_num_tx_queues) + if (queue_id < dev->num_tx_queues) dev->_tx[queue_id].pool = NULL; } From 8d2b51b008c25240914984208b2ced57d1dd25a5 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 16 Jan 2021 11:44:22 +0100 Subject: [PATCH 223/241] udp: mask TOS bits in udp_v4_early_demux() udp_v4_early_demux() is the only function that calls ip_mc_validate_source() with a TOS that hasn't been masked with IPTOS_RT_MASK. This results in different behaviours for incoming multicast UDPv4 packets, depending on if ip_mc_validate_source() is called from the early-demux path (udp_v4_early_demux) or from the regular input path (ip_route_input_noref). ECN would normally not be used with UDP multicast packets, so the practical consequences should be limited on that side. However, IPTOS_RT_MASK is used to also masks the TOS' high order bits, to align with the non-early-demux path behaviour. Reproducer: Setup two netns, connected with veth: $ ip netns add ns0 $ ip netns add ns1 $ ip -netns ns0 link set dev lo up $ ip -netns ns1 link set dev lo up $ ip link add name veth01 netns ns0 type veth peer name veth10 netns ns1 $ ip -netns ns0 link set dev veth01 up $ ip -netns ns1 link set dev veth10 up $ ip -netns ns0 address add 192.0.2.10 peer 192.0.2.11/32 dev veth01 $ ip -netns ns1 address add 192.0.2.11 peer 192.0.2.10/32 dev veth10 In ns0, add route to multicast address 224.0.2.0/24 using source address 198.51.100.10: $ ip -netns ns0 address add 198.51.100.10/32 dev lo $ ip -netns ns0 route add 224.0.2.0/24 dev veth01 src 198.51.100.10 In ns1, define route to 198.51.100.10, only for packets with TOS 4: $ ip -netns ns1 route add 198.51.100.10/32 tos 4 dev veth10 Also activate rp_filter in ns1, so that incoming packets not matching the above route get dropped: $ ip netns exec ns1 sysctl -wq net.ipv4.conf.veth10.rp_filter=1 Now try to receive packets on 224.0.2.11: $ ip netns exec ns1 socat UDP-RECVFROM:1111,ip-add-membership=224.0.2.11:veth10,ignoreeof - In ns0, send packet to 224.0.2.11 with TOS 4 and ECT(0) (that is, tos 6 for socat): $ echo test0 | ip netns exec ns0 socat - UDP-DATAGRAM:224.0.2.11:1111,bind=:1111,tos=6 The "test0" message is properly received by socat in ns1, because early-demux has no cached dst to use, so source address validation is done by ip_route_input_mc(), which receives a TOS that has the ECN bits masked. Now send another packet to 224.0.2.11, still with TOS 4 and ECT(0): $ echo test1 | ip netns exec ns0 socat - UDP-DATAGRAM:224.0.2.11:1111,bind=:1111,tos=6 The "test1" message isn't received by socat in ns1, because, now, early-demux has a cached dst to use and calls ip_mc_validate_source() immediately, without masking the ECN bits. Fixes: bc044e8db796 ("udp: perform source validation for mcast early demux") Signed-off-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- net/ipv4/udp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 7103b0a89756..69ea76578abb 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -2555,7 +2555,8 @@ int udp_v4_early_demux(struct sk_buff *skb) */ if (!inet_sk(sk)->inet_daddr && in_dev) return ip_mc_validate_source(skb, iph->daddr, - iph->saddr, iph->tos, + iph->saddr, + iph->tos & IPTOS_RT_MASK, skb->dev, in_dev, &itag); } return 0; From 2e5a6266fbb11ae93c468dfecab169aca9c27b43 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 16 Jan 2021 11:44:26 +0100 Subject: [PATCH 224/241] netfilter: rpfilter: mask ecn bits before fib lookup RT_TOS() only masks one of the two ECN bits. Therefore rpfilter_mt() treats Not-ECT or ECT(1) packets in a different way than those with ECT(0) or CE. Reproducer: Create two netns, connected with a veth: $ ip netns add ns0 $ ip netns add ns1 $ ip link add name veth01 netns ns0 type veth peer name veth10 netns ns1 $ ip -netns ns0 link set dev veth01 up $ ip -netns ns1 link set dev veth10 up $ ip -netns ns0 address add 192.0.2.10/32 dev veth01 $ ip -netns ns1 address add 192.0.2.11/32 dev veth10 Add a route to ns1 in ns0: $ ip -netns ns0 route add 192.0.2.11/32 dev veth01 In ns1, only packets with TOS 4 can be routed to ns0: $ ip -netns ns1 route add 192.0.2.10/32 tos 4 dev veth10 Ping from ns0 to ns1 works regardless of the ECN bits, as long as TOS is 4: $ ip netns exec ns0 ping -Q 4 192.0.2.11 # TOS 4, Not-ECT ... 0% packet loss ... $ ip netns exec ns0 ping -Q 5 192.0.2.11 # TOS 4, ECT(1) ... 0% packet loss ... $ ip netns exec ns0 ping -Q 6 192.0.2.11 # TOS 4, ECT(0) ... 0% packet loss ... $ ip netns exec ns0 ping -Q 7 192.0.2.11 # TOS 4, CE ... 0% packet loss ... Now use iptable's rpfilter module in ns1: $ ip netns exec ns1 iptables-legacy -t raw -A PREROUTING -m rpfilter --invert -j DROP Not-ECT and ECT(1) packets still pass: $ ip netns exec ns0 ping -Q 4 192.0.2.11 # TOS 4, Not-ECT ... 0% packet loss ... $ ip netns exec ns0 ping -Q 5 192.0.2.11 # TOS 4, ECT(1) ... 0% packet loss ... But ECT(0) and ECN packets are dropped: $ ip netns exec ns0 ping -Q 6 192.0.2.11 # TOS 4, ECT(0) ... 100% packet loss ... $ ip netns exec ns0 ping -Q 7 192.0.2.11 # TOS 4, CE ... 100% packet loss ... After this patch, rpfilter doesn't drop ECT(0) and CE packets anymore. Fixes: 8f97339d3feb ("netfilter: add ipv4 reverse path filter match") Signed-off-by: Guillaume Nault Signed-off-by: Jakub Kicinski --- net/ipv4/netfilter/ipt_rpfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index cc23f1ce239c..8cd3224d913e 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -76,7 +76,7 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.daddr = iph->saddr; flow.saddr = rpfilter_get_saddr(iph->daddr); flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; - flow.flowi4_tos = RT_TOS(iph->tos); + flow.flowi4_tos = iph->tos & IPTOS_RT_MASK; flow.flowi4_scope = RT_SCOPE_UNIVERSE; flow.flowi4_oif = l3mdev_master_ifindex_rcu(xt_in(par)); From a3eb4e9d4c9218476d05c52dfd2be3d6fdce6b91 Mon Sep 17 00:00:00 2001 From: Tariq Toukan Date: Sun, 17 Jan 2021 17:15:38 +0200 Subject: [PATCH 225/241] net: Disable NETIF_F_HW_TLS_RX when RXCSUM is disabled With NETIF_F_HW_TLS_RX packets are decrypted in HW. This cannot be logically done when RXCSUM offload is off. Fixes: 14136564c8ee ("net: Add TLS RX offload feature") Signed-off-by: Tariq Toukan Reviewed-by: Boris Pismenny Link: https://lore.kernel.org/r/20210117151538.9411-1-tariqt@nvidia.com Signed-off-by: Jakub Kicinski --- Documentation/networking/tls-offload.rst | 3 +++ net/core/dev.c | 5 +++++ 2 files changed, 8 insertions(+) diff --git a/Documentation/networking/tls-offload.rst b/Documentation/networking/tls-offload.rst index 9af3334d9ad0..5f0dea3d571e 100644 --- a/Documentation/networking/tls-offload.rst +++ b/Documentation/networking/tls-offload.rst @@ -534,3 +534,6 @@ offload. Hence, TLS TX device feature flag requires TX csum offload being set. Disabling the latter implies clearing the former. Disabling TX checksum offload should not affect old connections, and drivers should make sure checksum calculation does not break for them. +Similarly, device-offloaded TLS decryption implies doing RXCSUM. If the user +does not want to enable RX csum offload, TLS RX device feature is disabled +as well. diff --git a/net/core/dev.c b/net/core/dev.c index c360bb5367e2..a979b86dbacd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -9672,6 +9672,11 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } + if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) { + netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n"); + features &= ~NETIF_F_HW_TLS_RX; + } + return features; } From f6a2e94b3f9d89cb40771ff746b16b5687650cbb Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 18 Jan 2021 16:08:12 +0100 Subject: [PATCH 226/241] sh_eth: Fix power down vs. is_opened flag ordering MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit sh_eth_close() does a synchronous power down of the device before marking it closed. Revert the order, to make sure the device is never marked opened while suspended. While at it, use pm_runtime_put() instead of pm_runtime_put_sync(), as there is no reason to do a synchronous power down. Fixes: 7fa2955ff70ce453 ("sh_eth: Fix sleeping function called from invalid context") Signed-off-by: Geert Uytterhoeven Reviewed-by: Sergei Shtylyov Reviewed-by: Niklas Söderlund Link: https://lore.kernel.org/r/20210118150812.796791-1-geert+renesas@glider.be Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/renesas/sh_eth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 9b52d350e21a..590b088bc4c7 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -2606,10 +2606,10 @@ static int sh_eth_close(struct net_device *ndev) /* Free all the skbuffs in the Rx queue and the DMA buffer. */ sh_eth_ring_free(ndev); - pm_runtime_put_sync(&mdp->pdev->dev); - mdp->is_opened = 0; + pm_runtime_put(&mdp->pdev->dev); + return 0; } From 4964e5a1e080f785f5518b402a9e48c527fe6cbd Mon Sep 17 00:00:00 2001 From: Bongsu Jeon Date: Tue, 19 Jan 2021 05:55:22 +0900 Subject: [PATCH 227/241] net: nfc: nci: fix the wrong NCI_CORE_INIT parameters Fix the code because NCI_CORE_INIT_CMD includes two parameters in NCI2.0 but there is no parameters in NCI1.x. Fixes: bcd684aace34 ("net/nfc/nci: Support NCI 2.x initial sequence") Signed-off-by: Bongsu Jeon Link: https://lore.kernel.org/r/20210118205522.317087-1-bongsu.jeon@samsung.com Signed-off-by: Jakub Kicinski --- net/nfc/nci/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index e64727e1a72f..02a1f13f0798 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -508,7 +508,7 @@ static int nci_open_device(struct nci_dev *ndev) }; unsigned long opt = 0; - if (!(ndev->nci_ver & NCI_VER_2_MASK)) + if (ndev->nci_ver & NCI_VER_2_MASK) opt = (unsigned long)&nci_init_v2_cmd; rc = __nci_request(ndev, nci_init_req, opt, From fd23d2dc180fccfad4b27a8e52ba1bc415d18509 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 19 Jan 2021 10:59:30 +0800 Subject: [PATCH 228/241] selftests: net: fib_tests: remove duplicate log test The previous test added an address with a specified metric and check if correspond route was created. I somehow added two logs for the same test. Remove the duplicated one. Reported-by: Antoine Tenart Fixes: 0d29169a708b ("selftests/net/fib_tests: update addr_metric_test for peer route testing") Signed-off-by: Hangbin Liu Reviewed-by: David Ahern Link: https://lore.kernel.org/r/20210119025930.2810532-1-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/fib_tests.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/net/fib_tests.sh b/tools/testing/selftests/net/fib_tests.sh index 84205c3a55eb..2b5707738609 100755 --- a/tools/testing/selftests/net/fib_tests.sh +++ b/tools/testing/selftests/net/fib_tests.sh @@ -1055,7 +1055,6 @@ ipv6_addr_metric_test() check_route6 "2001:db8:104::1 dev dummy2 proto kernel metric 260" log_test $? 0 "Set metric with peer route on local side" - log_test $? 0 "User specified metric on local address" check_route6 "2001:db8:104::2 dev dummy2 proto kernel metric 260" log_test $? 0 "Set metric with peer route on peer side" From b160c28548bc0a87cbd16d5af6d3edcfd70b8c9a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Jan 2021 08:49:00 -0800 Subject: [PATCH 229/241] tcp: do not mess with cloned skbs in tcp_add_backlog() Heiner Kallweit reported that some skbs were sent with the following invalid GSO properties : - gso_size > 0 - gso_type == 0 This was triggerring a WARN_ON_ONCE() in rtl8169_tso_csum_v2. Juerg Haefliger was able to reproduce a similar issue using a lan78xx NIC and a workload mixing TCP incoming traffic and forwarded packets. The problem is that tcp_add_backlog() is writing over gso_segs and gso_size even if the incoming packet will not be coalesced to the backlog tail packet. While skb_try_coalesce() would bail out if tail packet is cloned, this overwriting would lead to corruptions of other packets cooked by lan78xx, sharing a common super-packet. The strategy used by lan78xx is to use a big skb, and split it into all received packets using skb_clone() to avoid copies. The drawback of this strategy is that all the small skb share a common struct skb_shared_info. This patch rewrites TCP gso_size/gso_segs handling to only happen on the tail skb, since skb_try_coalesce() made sure it was not cloned. Fixes: 4f693b55c3d2 ("tcp: implement coalescing on backlog queue") Signed-off-by: Eric Dumazet Bisected-by: Juerg Haefliger Tested-by: Juerg Haefliger Reported-by: Heiner Kallweit Link: https://bugzilla.kernel.org/show_bug.cgi?id=209423 Link: https://lore.kernel.org/r/20210119164900.766957-1-eric.dumazet@gmail.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 58207c7769d0..4e82745d336f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1760,6 +1760,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) { u32 limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf); + u32 tail_gso_size, tail_gso_segs; struct skb_shared_info *shinfo; const struct tcphdr *th; struct tcphdr *thtail; @@ -1767,6 +1768,7 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) unsigned int hdrlen; bool fragstolen; u32 gso_segs; + u32 gso_size; int delta; /* In case all data was pulled from skb frags (in __pskb_pull_tail()), @@ -1792,13 +1794,6 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) */ th = (const struct tcphdr *)skb->data; hdrlen = th->doff * 4; - shinfo = skb_shinfo(skb); - - if (!shinfo->gso_size) - shinfo->gso_size = skb->len - hdrlen; - - if (!shinfo->gso_segs) - shinfo->gso_segs = 1; tail = sk->sk_backlog.tail; if (!tail) @@ -1821,6 +1816,15 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) goto no_coalesce; __skb_pull(skb, hdrlen); + + shinfo = skb_shinfo(skb); + gso_size = shinfo->gso_size ?: skb->len; + gso_segs = shinfo->gso_segs ?: 1; + + shinfo = skb_shinfo(tail); + tail_gso_size = shinfo->gso_size ?: (tail->len - hdrlen); + tail_gso_segs = shinfo->gso_segs ?: 1; + if (skb_try_coalesce(tail, skb, &fragstolen, &delta)) { TCP_SKB_CB(tail)->end_seq = TCP_SKB_CB(skb)->end_seq; @@ -1847,11 +1851,8 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) } /* Not as strict as GRO. We only need to carry mss max value */ - skb_shinfo(tail)->gso_size = max(shinfo->gso_size, - skb_shinfo(tail)->gso_size); - - gso_segs = skb_shinfo(tail)->gso_segs + shinfo->gso_segs; - skb_shinfo(tail)->gso_segs = min_t(u32, gso_segs, 0xFFFF); + shinfo->gso_size = max(gso_size, tail_gso_size); + shinfo->gso_segs = min_t(u32, gso_segs + tail_gso_segs, 0xFFFF); sk->sk_backlog.len += delta; __NET_INC_STATS(sock_net(sk), From 8e4052c32d6b4b39c1e13c652c7e33748d447409 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 19 Jan 2021 17:48:03 +0300 Subject: [PATCH 230/241] net: dsa: b53: fix an off by one in checking "vlan->vid" The > comparison should be >= to prevent accessing one element beyond the end of the dev->vlans[] array in the caller function, b53_vlan_add(). The "dev->vlans" array is allocated in the b53_switch_init() function and it has "dev->num_vlans" elements. Fixes: a2482d2ce349 ("net: dsa: b53: Plug in VLAN support") Signed-off-by: Dan Carpenter Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/YAbxI97Dl/pmBy5V@mwanda Signed-off-by: Jakub Kicinski --- drivers/net/dsa/b53/b53_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 288b5a5c3e0d..95c7fa171e35 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1404,7 +1404,7 @@ int b53_vlan_prepare(struct dsa_switch *ds, int port, !(vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED)) return -EINVAL; - if (vlan->vid_end > dev->num_vlans) + if (vlan->vid_end >= dev->num_vlans) return -ERANGE; b53_enable_vlan(dev, true, ds->vlan_filtering); From 9c30ae8398b0813e237bde387d67a7f74ab2db2d Mon Sep 17 00:00:00 2001 From: Yuchung Cheng Date: Tue, 19 Jan 2021 11:26:19 -0800 Subject: [PATCH 231/241] tcp: fix TCP socket rehash stats mis-accounting The previous commit 32efcc06d2a1 ("tcp: export count for rehash attempts") would mis-account rehashing SNMP and socket stats: a. During handshake of an active open, only counts the first SYN timeout b. After handshake of passive and active open, stop updating after (roughly) TCP_RETRIES1 recurring RTOs c. After the socket aborts, over count timeout_rehash by 1 This patch fixes this by checking the rehash result from sk_rethink_txhash. Fixes: 32efcc06d2a1 ("tcp: export count for rehash attempts") Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Link: https://lore.kernel.org/r/20210119192619.1848270-1-ycheng@google.com Signed-off-by: Jakub Kicinski --- include/net/sock.h | 17 ++++++++++++----- net/ipv4/tcp_input.c | 5 ++--- net/ipv4/tcp_timer.c | 22 ++++++++-------------- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index bdc4323ce53c..129d200bccb4 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1921,10 +1921,13 @@ static inline void sk_set_txhash(struct sock *sk) sk->sk_txhash = net_tx_rndhash(); } -static inline void sk_rethink_txhash(struct sock *sk) +static inline bool sk_rethink_txhash(struct sock *sk) { - if (sk->sk_txhash) + if (sk->sk_txhash) { sk_set_txhash(sk); + return true; + } + return false; } static inline struct dst_entry * @@ -1947,12 +1950,10 @@ sk_dst_get(struct sock *sk) return dst; } -static inline void dst_negative_advice(struct sock *sk) +static inline void __dst_negative_advice(struct sock *sk) { struct dst_entry *ndst, *dst = __sk_dst_get(sk); - sk_rethink_txhash(sk); - if (dst && dst->ops->negative_advice) { ndst = dst->ops->negative_advice(dst); @@ -1964,6 +1965,12 @@ static inline void dst_negative_advice(struct sock *sk) } } +static inline void dst_negative_advice(struct sock *sk) +{ + sk_rethink_txhash(sk); + __dst_negative_advice(sk); +} + static inline void __sk_dst_set(struct sock *sk, struct dst_entry *dst) { diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bafcab75f425..a7dfca0a38cd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4397,10 +4397,9 @@ static void tcp_rcv_spurious_retrans(struct sock *sk, const struct sk_buff *skb) * The receiver remembers and reflects via DSACKs. Leverage the * DSACK state and change the txhash to re-route speculatively. */ - if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq) { - sk_rethink_txhash(sk); + if (TCP_SKB_CB(skb)->seq == tcp_sk(sk)->duplicate_sack[0].start_seq && + sk_rethink_txhash(sk)) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDUPLICATEDATAREHASH); - } } static void tcp_send_dupack(struct sock *sk, const struct sk_buff *skb) diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 454732ecc8f3..faa92948441b 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -219,14 +219,8 @@ static int tcp_write_timeout(struct sock *sk) int retry_until; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { - if (icsk->icsk_retransmits) { - dst_negative_advice(sk); - } else { - sk_rethink_txhash(sk); - tp->timeout_rehash++; - __NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPTIMEOUTREHASH); - } + if (icsk->icsk_retransmits) + __dst_negative_advice(sk); retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries; expired = icsk->icsk_retransmits >= retry_until; } else { @@ -234,12 +228,7 @@ static int tcp_write_timeout(struct sock *sk) /* Black hole detection */ tcp_mtu_probing(icsk, sk); - dst_negative_advice(sk); - } else { - sk_rethink_txhash(sk); - tp->timeout_rehash++; - __NET_INC_STATS(sock_net(sk), - LINUX_MIB_TCPTIMEOUTREHASH); + __dst_negative_advice(sk); } retry_until = net->ipv4.sysctl_tcp_retries2; @@ -270,6 +259,11 @@ static int tcp_write_timeout(struct sock *sk) return 1; } + if (sk_rethink_txhash(sk)) { + tp->timeout_rehash++; + __NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEOUTREHASH); + } + return 0; } From 03f16c5075b22c8902d2af739969e878b0879c94 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 20 Jan 2021 20:41:35 +0900 Subject: [PATCH 232/241] can: dev: can_restart: fix use after free bug After calling netif_rx_ni(skb), dereferencing skb is unsafe. Especially, the can_frame cf which aliases skb memory is accessed after the netif_rx_ni() in: stats->rx_bytes += cf->len; Reordering the lines solves the issue. Fixes: 39549eef3587 ("can: CAN Network device driver and Netlink interface") Link: https://lore.kernel.org/r/20210120114137.200019-2-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- drivers/net/can/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index 3486704c8a95..8b1ae023cb21 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -592,11 +592,11 @@ static void can_restart(struct net_device *dev) cf->can_id |= CAN_ERR_RESTARTED; - netif_rx_ni(skb); - stats->rx_packets++; stats->rx_bytes += cf->len; + netif_rx_ni(skb); + restart: netdev_dbg(dev, "restarted\n"); priv->can_stats.restarts++; From 75854cad5d80976f6ea0f0431f8cedd3bcc475cb Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 20 Jan 2021 20:41:36 +0900 Subject: [PATCH 233/241] can: vxcan: vxcan_xmit: fix use after free bug After calling netif_rx_ni(skb), dereferencing skb is unsafe. Especially, the canfd_frame cfd which aliases skb memory is accessed after the netif_rx_ni(). Fixes: a8f820a380a2 ("can: add Virtual CAN Tunnel driver (vxcan)") Link: https://lore.kernel.org/r/20210120114137.200019-3-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- drivers/net/can/vxcan.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/vxcan.c b/drivers/net/can/vxcan.c index fa47bab510bb..f9a524c5f6d6 100644 --- a/drivers/net/can/vxcan.c +++ b/drivers/net/can/vxcan.c @@ -39,6 +39,7 @@ static netdev_tx_t vxcan_xmit(struct sk_buff *skb, struct net_device *dev) struct net_device *peer; struct canfd_frame *cfd = (struct canfd_frame *)skb->data; struct net_device_stats *peerstats, *srcstats = &dev->stats; + u8 len; if (can_dropped_invalid_skb(dev, skb)) return NETDEV_TX_OK; @@ -61,12 +62,13 @@ static netdev_tx_t vxcan_xmit(struct sk_buff *skb, struct net_device *dev) skb->dev = peer; skb->ip_summed = CHECKSUM_UNNECESSARY; + len = cfd->len; if (netif_rx_ni(skb) == NET_RX_SUCCESS) { srcstats->tx_packets++; - srcstats->tx_bytes += cfd->len; + srcstats->tx_bytes += len; peerstats = &peer->stats; peerstats->rx_packets++; - peerstats->rx_bytes += cfd->len; + peerstats->rx_bytes += len; } out_unlock: From 50aca891d7a554db0901b245167cd653d73aaa71 Mon Sep 17 00:00:00 2001 From: Vincent Mailhol Date: Wed, 20 Jan 2021 20:41:37 +0900 Subject: [PATCH 234/241] can: peak_usb: fix use after free bugs After calling peak_usb_netif_rx_ni(skb), dereferencing skb is unsafe. Especially, the can_frame cf which aliases skb memory is accessed after the peak_usb_netif_rx_ni(). Reordering the lines solves the issue. Fixes: 0a25e1f4f185 ("can: peak_usb: add support for PEAK new CANFD USB adapters") Link: https://lore.kernel.org/r/20210120114137.200019-4-mailhol.vincent@wanadoo.fr Signed-off-by: Vincent Mailhol Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/peak_usb/pcan_usb_fd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c index 61631f4fd92a..f347ecc79aef 100644 --- a/drivers/net/can/usb/peak_usb/pcan_usb_fd.c +++ b/drivers/net/can/usb/peak_usb/pcan_usb_fd.c @@ -514,11 +514,11 @@ static int pcan_usb_fd_decode_canmsg(struct pcan_usb_fd_if *usb_if, else memcpy(cfd->data, rm->d, cfd->len); - peak_usb_netif_rx(skb, &usb_if->time_ref, le32_to_cpu(rm->ts_low)); - netdev->stats.rx_packets++; netdev->stats.rx_bytes += cfd->len; + peak_usb_netif_rx(skb, &usb_if->time_ref, le32_to_cpu(rm->ts_low)); + return 0; } @@ -580,11 +580,11 @@ static int pcan_usb_fd_decode_status(struct pcan_usb_fd_if *usb_if, if (!skb) return -ENOMEM; - peak_usb_netif_rx(skb, &usb_if->time_ref, le32_to_cpu(sm->ts_low)); - netdev->stats.rx_packets++; netdev->stats.rx_bytes += cf->len; + peak_usb_netif_rx(skb, &usb_if->time_ref, le32_to_cpu(sm->ts_low)); + return 0; } From bc895e8b2a64e502fbba72748d59618272052a8b Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 20 Jan 2021 00:24:24 +0100 Subject: [PATCH 235/241] bpf: Fix signed_{sub,add32}_overflows type handling Fix incorrect signed_{sub,add32}_overflows() input types (and a related buggy comment). It looks like this might have slipped in via copy/paste issue, also given prior to 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") the signature of signed_sub_overflows() had s64 a and s64 b as its input args whereas now they are truncated to s32. Thus restore proper types. Also, the case of signed_add32_overflows() is not consistent to signed_sub32_overflows(). Both have s32 as inputs, therefore align the former. Fixes: 3f50f132d840 ("bpf: Verifier, do explicit ALU32 bounds tracking") Reported-by: De4dCr0w Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Acked-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 36af69fac591..e7368c5eacb7 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5313,7 +5313,7 @@ static bool signed_add_overflows(s64 a, s64 b) return res < a; } -static bool signed_add32_overflows(s64 a, s64 b) +static bool signed_add32_overflows(s32 a, s32 b) { /* Do the add in u32, where overflow is well-defined */ s32 res = (s32)((u32)a + (u32)b); @@ -5323,7 +5323,7 @@ static bool signed_add32_overflows(s64 a, s64 b) return res < a; } -static bool signed_sub_overflows(s32 a, s32 b) +static bool signed_sub_overflows(s64 a, s64 b) { /* Do the sub in u64, where overflow is well-defined */ s64 res = (s64)((u64)a - (u64)b); @@ -5335,7 +5335,7 @@ static bool signed_sub_overflows(s32 a, s32 b) static bool signed_sub32_overflows(s32 a, s32 b) { - /* Do the sub in u64, where overflow is well-defined */ + /* Do the sub in u32, where overflow is well-defined */ s32 res = (s32)((u32)a - (u32)b); if (b < 0) From c89dffc70b340780e5b933832d8c3e045ef3791e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 18 Jan 2021 14:59:20 +0900 Subject: [PATCH 236/241] tcp: Fix potential use-after-free due to double kfree() Receiving ACK with a valid SYN cookie, cookie_v4_check() allocates struct request_sock and then can allocate inet_rsk(req)->ireq_opt. After that, tcp_v4_syn_recv_sock() allocates struct sock and copies ireq_opt to inet_sk(sk)->inet_opt. Normally, tcp_v4_syn_recv_sock() inserts the full socket into ehash and sets NULL to ireq_opt. Otherwise, tcp_v4_syn_recv_sock() has to reset inet_opt by NULL and free the full socket. The commit 01770a1661657 ("tcp: fix race condition when creating child sockets from syncookies") added a new path, in which more than one cores create full sockets for the same SYN cookie. Currently, the core which loses the race frees the full socket without resetting inet_opt, resulting in that both sock_put() and reqsk_put() call kfree() for the same memory: sock_put sk_free __sk_free sk_destruct __sk_destruct sk->sk_destruct/inet_sock_destruct kfree(rcu_dereference_protected(inet->inet_opt, 1)); reqsk_put reqsk_free __reqsk_free req->rsk_ops->destructor/tcp_v4_reqsk_destructor kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1)); Calling kmalloc() between the double kfree() can lead to use-after-free, so this patch fixes it by setting NULL to inet_opt before sock_put(). As a side note, this kind of issue does not happen for IPv6. This is because tcp_v6_syn_recv_sock() clones both ipv6_opt and pktopts which correspond to ireq_opt in IPv4. Fixes: 01770a166165 ("tcp: fix race condition when creating child sockets from syncookies") CC: Ricardo Dias Signed-off-by: Kuniyuki Iwashima Reviewed-by: Benjamin Herrenschmidt Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20210118055920.82516-1-kuniyu@amazon.co.jp Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4e82745d336f..777306b5bc22 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1595,6 +1595,8 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, tcp_move_syn(newtp, req); ireq->ireq_opt = NULL; } else { + newinet->inet_opt = NULL; + if (!req_unhash && found_dup_sk) { /* This code path should only be executed in the * syncookie case only @@ -1602,8 +1604,6 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, bh_unlock_sock(newsk); sock_put(newsk); newsk = NULL; - } else { - newinet->inet_opt = NULL; } } return newsk; From 584b7cfcdc7d6d416a9d6fece9516764bd977d2e Mon Sep 17 00:00:00 2001 From: Alban Bedel Date: Tue, 19 Jan 2021 15:06:38 +0100 Subject: [PATCH 237/241] net: mscc: ocelot: Fix multicast to the CPU port Multicast entries in the MAC table use the high bits of the MAC address to encode the ports that should get the packets. But this port mask does not work for the CPU port, to receive these packets on the CPU port the MAC_CPU_COPY flag must be set. Because of this IPv6 was effectively not working because neighbor solicitations were never received. This was not apparent before commit 9403c158 (net: mscc: ocelot: support IPv4, IPv6 and plain Ethernet mdb entries) as the IPv6 entries were broken so all incoming IPv6 multicast was then treated as unknown and flooded on all ports. To fix this problem rework the ocelot_mact_learn() to set the MAC_CPU_COPY flag when a multicast entry that target the CPU port is added. For this we have to read back the ports endcoded in the pseudo MAC address by the caller. It is not a very nice design but that avoid changing the callers and should make backporting easier. Signed-off-by: Alban Bedel Fixes: 9403c158b872 ("net: mscc: ocelot: support IPv4, IPv6 and plain Ethernet mdb entries") Link: https://lore.kernel.org/r/20210119140638.203374-1-alban.bedel@aerq.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mscc/ocelot.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 0b9992bd6626..ff87a0bc089c 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -60,14 +60,27 @@ int ocelot_mact_learn(struct ocelot *ocelot, int port, const unsigned char mac[ETH_ALEN], unsigned int vid, enum macaccess_entry_type type) { + u32 cmd = ANA_TABLES_MACACCESS_VALID | + ANA_TABLES_MACACCESS_DEST_IDX(port) | + ANA_TABLES_MACACCESS_ENTRYTYPE(type) | + ANA_TABLES_MACACCESS_MAC_TABLE_CMD(MACACCESS_CMD_LEARN); + unsigned int mc_ports; + + /* Set MAC_CPU_COPY if the CPU port is used by a multicast entry */ + if (type == ENTRYTYPE_MACv4) + mc_ports = (mac[1] << 8) | mac[2]; + else if (type == ENTRYTYPE_MACv6) + mc_ports = (mac[0] << 8) | mac[1]; + else + mc_ports = 0; + + if (mc_ports & BIT(ocelot->num_phys_ports)) + cmd |= ANA_TABLES_MACACCESS_MAC_CPU_COPY; + ocelot_mact_select(ocelot, mac, vid); /* Issue a write command */ - ocelot_write(ocelot, ANA_TABLES_MACACCESS_VALID | - ANA_TABLES_MACACCESS_DEST_IDX(port) | - ANA_TABLES_MACACCESS_ENTRYTYPE(type) | - ANA_TABLES_MACACCESS_MAC_TABLE_CMD(MACACCESS_CMD_LEARN), - ANA_TABLES_MACACCESS); + ocelot_write(ocelot, cmd, ANA_TABLES_MACACCESS); return ocelot_mact_wait_for_completion(ocelot); } From de658a195ee23ca6aaffe197d1d2ea040beea0a2 Mon Sep 17 00:00:00 2001 From: Grant Grundler Date: Tue, 19 Jan 2021 17:12:08 -0800 Subject: [PATCH 238/241] net: usb: cdc_ncm: don't spew notifications RTL8156 sends notifications about every 32ms. Only display/log notifications when something changes. This issue has been reported by others: https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1832472 https://lkml.org/lkml/2020/8/27/1083 ... [785962.779840] usb 1-1: new high-speed USB device number 5 using xhci_hcd [785962.929944] usb 1-1: New USB device found, idVendor=0bda, idProduct=8156, bcdDevice=30.00 [785962.929949] usb 1-1: New USB device strings: Mfr=1, Product=2, SerialNumber=6 [785962.929952] usb 1-1: Product: USB 10/100/1G/2.5G LAN [785962.929954] usb 1-1: Manufacturer: Realtek [785962.929956] usb 1-1: SerialNumber: 000000001 [785962.991755] usbcore: registered new interface driver cdc_ether [785963.017068] cdc_ncm 1-1:2.0: MAC-Address: 00:24:27:88:08:15 [785963.017072] cdc_ncm 1-1:2.0: setting rx_max = 16384 [785963.017169] cdc_ncm 1-1:2.0: setting tx_max = 16384 [785963.017682] cdc_ncm 1-1:2.0 usb0: register 'cdc_ncm' at usb-0000:00:14.0-1, CDC NCM, 00:24:27:88:08:15 [785963.019211] usbcore: registered new interface driver cdc_ncm [785963.023856] usbcore: registered new interface driver cdc_wdm [785963.025461] usbcore: registered new interface driver cdc_mbim [785963.038824] cdc_ncm 1-1:2.0 enx002427880815: renamed from usb0 [785963.089586] cdc_ncm 1-1:2.0 enx002427880815: network connection: disconnected [785963.121673] cdc_ncm 1-1:2.0 enx002427880815: network connection: disconnected [785963.153682] cdc_ncm 1-1:2.0 enx002427880815: network connection: disconnected ... This is about 2KB per second and will overwrite all contents of a 1MB dmesg buffer in under 10 minutes rendering them useless for debugging many kernel problems. This is also an extra 180 MB/day in /var/logs (or 1GB per week) rendering the majority of those logs useless too. When the link is up (expected state), spew amount is >2x higher: ... [786139.600992] cdc_ncm 2-1:2.0 enx002427880815: network connection: connected [786139.632997] cdc_ncm 2-1:2.0 enx002427880815: 2500 mbit/s downlink 2500 mbit/s uplink [786139.665097] cdc_ncm 2-1:2.0 enx002427880815: network connection: connected [786139.697100] cdc_ncm 2-1:2.0 enx002427880815: 2500 mbit/s downlink 2500 mbit/s uplink [786139.729094] cdc_ncm 2-1:2.0 enx002427880815: network connection: connected [786139.761108] cdc_ncm 2-1:2.0 enx002427880815: 2500 mbit/s downlink 2500 mbit/s uplink ... Chrome OS cannot support RTL8156 until this is fixed. Signed-off-by: Grant Grundler Reviewed-by: Hayes Wang Link: https://lore.kernel.org/r/20210120011208.3768105-1-grundler@chromium.org Signed-off-by: Jakub Kicinski --- drivers/net/usb/cdc_ncm.c | 12 +++++++++++- include/linux/usb/usbnet.h | 2 ++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/net/usb/cdc_ncm.c b/drivers/net/usb/cdc_ncm.c index 5a78848db93f..291e76d32abe 100644 --- a/drivers/net/usb/cdc_ncm.c +++ b/drivers/net/usb/cdc_ncm.c @@ -1827,6 +1827,15 @@ cdc_ncm_speed_change(struct usbnet *dev, uint32_t rx_speed = le32_to_cpu(data->DLBitRRate); uint32_t tx_speed = le32_to_cpu(data->ULBitRate); + /* if the speed hasn't changed, don't report it. + * RTL8156 shipped before 2021 sends notification about every 32ms. + */ + if (dev->rx_speed == rx_speed && dev->tx_speed == tx_speed) + return; + + dev->rx_speed = rx_speed; + dev->tx_speed = tx_speed; + /* * Currently the USB-NET API does not support reporting the actual * device speed. Do print it instead. @@ -1867,7 +1876,8 @@ static void cdc_ncm_status(struct usbnet *dev, struct urb *urb) * USB_CDC_NOTIFY_NETWORK_CONNECTION notification shall be * sent by device after USB_CDC_NOTIFY_SPEED_CHANGE. */ - usbnet_link_change(dev, !!event->wValue, 0); + if (netif_carrier_ok(dev->net) != !!event->wValue) + usbnet_link_change(dev, !!event->wValue, 0); break; case USB_CDC_NOTIFY_SPEED_CHANGE: diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 88a7673894d5..cfbfd6fe01df 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -81,6 +81,8 @@ struct usbnet { # define EVENT_LINK_CHANGE 11 # define EVENT_SET_RX_MODE 12 # define EVENT_NO_IP_ALIGN 13 + u32 rx_speed; /* in bps - NOT Mbps */ + u32 tx_speed; /* in bps - NOT Mbps */ }; static inline struct usb_driver *driver_of(struct usb_interface *intf) From 0c630a66bf10991b0ef13d27c93d7545e692ef5b Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Tue, 19 Jan 2021 20:44:23 -0800 Subject: [PATCH 239/241] net: systemport: free dev before on error path On the error path, it should goto the error handling label to free allocated memory rather than directly return. Fixes: 31bc72d97656 ("net: systemport: fetch and use clock resources") Signed-off-by: Pan Bian Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20210120044423.1704-1-bianpan2016@163.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/broadcom/bcmsysport.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/broadcom/bcmsysport.c b/drivers/net/ethernet/broadcom/bcmsysport.c index b1ae9eb8f247..0404aafd5ce5 100644 --- a/drivers/net/ethernet/broadcom/bcmsysport.c +++ b/drivers/net/ethernet/broadcom/bcmsysport.c @@ -2503,8 +2503,10 @@ static int bcm_sysport_probe(struct platform_device *pdev) priv = netdev_priv(dev); priv->clk = devm_clk_get_optional(&pdev->dev, "sw_sysport"); - if (IS_ERR(priv->clk)) - return PTR_ERR(priv->clk); + if (IS_ERR(priv->clk)) { + ret = PTR_ERR(priv->clk); + goto err_free_netdev; + } /* Allocate number of TX rings */ priv->tx_rings = devm_kcalloc(&pdev->dev, txq, From db58465f1121086b524be80be39d1fedbe5387f3 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 20 Jan 2021 16:11:12 +0000 Subject: [PATCH 240/241] cachefiles: Drop superfluous readpages aops NULL check After the recent actions to convert readpages aops to readahead, the NULL checks of readpages aops in cachefiles_read_or_alloc_page() may hit falsely. More badly, it's an ASSERT() call, and this panics. Drop the superfluous NULL checks for fixing this regression. [DH: Note that cachefiles never actually used readpages, so this check was never actually necessary] BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=208883 BugLink: https://bugzilla.opensuse.org/show_bug.cgi?id=1175245 Fixes: 9ae326a69004 ("CacheFiles: A cache that backs onto a mounted filesystem") Signed-off-by: Takashi Iwai Signed-off-by: David Howells Acked-by: Matthew Wilcox (Oracle) Signed-off-by: Linus Torvalds --- fs/cachefiles/rdwr.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 8bda092e60c5..e027c718ca01 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -413,7 +413,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, inode = d_backing_inode(object->backer); ASSERT(S_ISREG(inode->i_mode)); - ASSERT(inode->i_mapping->a_ops->readpages); /* calculate the shift required to use bmap */ shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; @@ -713,7 +712,6 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, inode = d_backing_inode(object->backer); ASSERT(S_ISREG(inode->i_mode)); - ASSERT(inode->i_mapping->a_ops->readpages); /* calculate the shift required to use bmap */ shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; From 7178a107f5ea7bdb1cc23073234f0ded0ef90ec7 Mon Sep 17 00:00:00 2001 From: Tianjia Zhang Date: Tue, 19 Jan 2021 00:13:19 +0000 Subject: [PATCH 241/241] X.509: Fix crash caused by NULL pointer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On the following call path, `sig->pkey_algo` is not assigned in asymmetric_key_verify_signature(), which causes runtime crash in public_key_verify_signature(). keyctl_pkey_verify asymmetric_key_verify_signature verify_signature public_key_verify_signature This patch simply check this situation and fixes the crash caused by NULL pointer. Fixes: 215525639631 ("X.509: support OSCCA SM2-with-SM3 certificate verification") Reported-by: Tobias Markus Signed-off-by: Tianjia Zhang Signed-off-by: David Howells Reviewed-and-tested-by: Toke Høiland-Jørgensen Tested-by: João Fonseca Acked-by: Jarkko Sakkinen Cc: stable@vger.kernel.org # v5.10+ Signed-off-by: Linus Torvalds --- crypto/asymmetric_keys/public_key.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/crypto/asymmetric_keys/public_key.c b/crypto/asymmetric_keys/public_key.c index 8892908ad58c..788a4ba1e2e7 100644 --- a/crypto/asymmetric_keys/public_key.c +++ b/crypto/asymmetric_keys/public_key.c @@ -356,7 +356,8 @@ int public_key_verify_signature(const struct public_key *pkey, if (ret) goto error_free_key; - if (strcmp(sig->pkey_algo, "sm2") == 0 && sig->data_size) { + if (sig->pkey_algo && strcmp(sig->pkey_algo, "sm2") == 0 && + sig->data_size) { ret = cert_sig_digest_update(sig, tfm); if (ret) goto error_free_key;