From 89aa593010135660991d05c92528c2c9163d5900 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 8 Sep 2017 15:23:18 +0800 Subject: [PATCH 01/18] ceph: keep auth cap when inode has flocks or posix locks file locks are tracked by inode's auth mds. dropping auth caps is equivalent to releasing all file locks. Signed-off-by: "Yan, Zheng" Acked-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 1 + fs/ceph/locks.c | 62 +++++++++++++++++++++++++++++++++++++------- fs/ceph/mds_client.c | 5 ++++ fs/ceph/super.h | 1 + 4 files changed, 60 insertions(+), 9 deletions(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f2550a076edc..6301bf299b7c 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -493,6 +493,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_wb_ref = 0; ci->i_wrbuffer_ref = 0; ci->i_wrbuffer_ref_head = 0; + atomic_set(&ci->i_filelock_ref, 0); ci->i_shared_gen = 0; ci->i_rdcache_gen = 0; ci->i_rdcache_revoking = 0; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index e7cce412f2cf..316d550b9603 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -30,19 +30,46 @@ void __init ceph_flock_init(void) get_random_bytes(&lock_secret, sizeof(lock_secret)); } +static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) +{ + struct inode *inode = file_inode(src->fl_file); + atomic_inc(&ceph_inode(inode)->i_filelock_ref); +} + +static void ceph_fl_release_lock(struct file_lock *fl) +{ + struct inode *inode = file_inode(fl->fl_file); + atomic_dec(&ceph_inode(inode)->i_filelock_ref); +} + +static const struct file_lock_operations ceph_fl_lock_ops = { + .fl_copy_lock = ceph_fl_copy_lock, + .fl_release_private = ceph_fl_release_lock, +}; + /** * Implement fcntl and flock locking functions. 
*/ -static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file, +static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, int cmd, u8 wait, struct file_lock *fl) { - struct inode *inode = file_inode(file); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_mds_request *req; int err; u64 length = 0; u64 owner; + if (operation == CEPH_MDS_OP_SETFILELOCK) { + /* + * increasing i_filelock_ref closes race window between + * handling request reply and adding file_lock struct to + * inode. Otherwise, auth caps may get trimmed in the + * window. Caller function will decrease the counter. + */ + fl->fl_ops = &ceph_fl_lock_ops; + atomic_inc(&ceph_inode(inode)->i_filelock_ref); + } + if (operation != CEPH_MDS_OP_SETFILELOCK || cmd == CEPH_LOCK_UNLOCK) wait = 0; @@ -180,10 +207,11 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, */ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) { - u8 lock_cmd; + struct inode *inode = file_inode(file); int err; - u8 wait = 0; u16 op = CEPH_MDS_OP_SETFILELOCK; + u8 lock_cmd; + u8 wait = 0; if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; @@ -199,6 +227,17 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) else if (IS_SETLKW(cmd)) wait = 1; + if (op == CEPH_MDS_OP_SETFILELOCK) { + /* + * increasing i_filelock_ref closes race window between + * handling request reply and adding file_lock struct to + * inode. Otherwise, i_auth_cap may get trimmed in the + * window. Caller function will decrease the counter. 
+ */ + fl->fl_ops = &ceph_fl_lock_ops; + atomic_inc(&ceph_inode(inode)->i_filelock_ref); + } + if (F_RDLCK == fl->fl_type) lock_cmd = CEPH_LOCK_SHARED; else if (F_WRLCK == fl->fl_type) @@ -206,7 +245,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) else lock_cmd = CEPH_LOCK_UNLOCK; - err = ceph_lock_message(CEPH_LOCK_FCNTL, op, file, lock_cmd, wait, fl); + err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl); if (!err) { if (op != CEPH_MDS_OP_GETFILELOCK) { dout("mds locked, locking locally"); @@ -215,7 +254,7 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) /* undo! This should only happen if * the kernel detects local * deadlock. */ - ceph_lock_message(CEPH_LOCK_FCNTL, op, file, + ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, CEPH_LOCK_UNLOCK, 0, fl); dout("got %d on posix_lock_file, undid lock", err); @@ -227,8 +266,9 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) int ceph_flock(struct file *file, int cmd, struct file_lock *fl) { - u8 lock_cmd; + struct inode *inode = file_inode(file); int err; + u8 lock_cmd; u8 wait = 0; if (!(fl->fl_flags & FL_FLOCK)) @@ -239,6 +279,10 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) dout("ceph_flock, fl_file: %p", fl->fl_file); + /* see comment in ceph_lock */ + fl->fl_ops = &ceph_fl_lock_ops; + atomic_inc(&ceph_inode(inode)->i_filelock_ref); + if (IS_SETLKW(cmd)) wait = 1; @@ -250,13 +294,13 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) lock_cmd = CEPH_LOCK_UNLOCK; err = ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, lock_cmd, wait, fl); + inode, lock_cmd, wait, fl); if (!err) { err = locks_lock_file_wait(file, fl); if (err) { ceph_lock_message(CEPH_LOCK_FLOCK, CEPH_MDS_OP_SETFILELOCK, - file, CEPH_LOCK_UNLOCK, 0, fl); + inode, CEPH_LOCK_UNLOCK, 0, fl); dout("got %d on locks_lock_file_wait, undid lock", err); } } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 
0687ab3c3267..c8a811db387a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1462,6 +1462,11 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) goto out; if ((used | wanted) & CEPH_CAP_ANY_WR) goto out; + /* Note: it's possible that i_filelock_ref becomes non-zero + * after dropping auth caps. It doesn't hurt because reply + * of lock mds request will re-add auth caps. */ + if (atomic_read(&ci->i_filelock_ref) > 0) + goto out; } /* The inode has cached pages, but it's no longer used. * we can safely drop it */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3e27a28aa44a..100596c49353 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -352,6 +352,7 @@ struct ceph_inode_info { int i_pin_ref; int i_rd_ref, i_rdcache_ref, i_wr_ref, i_wb_ref; int i_wrbuffer_ref, i_wrbuffer_ref_head; + atomic_t i_filelock_ref; u32 i_shared_gen; /* increment each time we get FILE_SHARED */ u32 i_rdcache_gen; /* incremented each time we get FILE_CACHE. */ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */ From c6db84723363790160a89dee4554ad2f0687a0c5 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 11 Sep 2017 09:58:56 +0800 Subject: [PATCH 02/18] ceph: make lock_to_ceph_filelock() static Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/locks.c | 62 ++++++++++++++++++++++++------------------------- fs/ceph/super.h | 1 - 2 files changed, 31 insertions(+), 32 deletions(-) diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 316d550b9603..2927f3bc2fc9 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -332,6 +332,37 @@ void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) *flock_count, *fcntl_count); } +/* + * Given a pointer to a lock, convert it to a ceph filelock + */ +static int lock_to_ceph_filelock(struct file_lock *lock, + struct ceph_filelock *cephlock) +{ + int err = 0; + cephlock->start = cpu_to_le64(lock->fl_start); + cephlock->length 
= cpu_to_le64(lock->fl_end - lock->fl_start + 1); + cephlock->client = cpu_to_le64(0); + cephlock->pid = cpu_to_le64((u64)lock->fl_pid); + cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); + + switch (lock->fl_type) { + case F_RDLCK: + cephlock->type = CEPH_LOCK_SHARED; + break; + case F_WRLCK: + cephlock->type = CEPH_LOCK_EXCL; + break; + case F_UNLCK: + cephlock->type = CEPH_LOCK_UNLOCK; + break; + default: + dout("Have unknown lock type %d", lock->fl_type); + err = -EINVAL; + } + + return err; +} + /** * Encode the flock and fcntl locks for the given inode into the ceph_filelock * array. Must be called with inode->i_lock already held. @@ -416,34 +447,3 @@ int ceph_locks_to_pagelist(struct ceph_filelock *flocks, out_fail: return err; } - -/* - * Given a pointer to a lock, convert it to a ceph filelock - */ -int lock_to_ceph_filelock(struct file_lock *lock, - struct ceph_filelock *cephlock) -{ - int err = 0; - cephlock->start = cpu_to_le64(lock->fl_start); - cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1); - cephlock->client = cpu_to_le64(0); - cephlock->pid = cpu_to_le64((u64)lock->fl_pid); - cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner)); - - switch (lock->fl_type) { - case F_RDLCK: - cephlock->type = CEPH_LOCK_SHARED; - break; - case F_WRLCK: - cephlock->type = CEPH_LOCK_EXCL; - break; - case F_UNLCK: - cephlock->type = CEPH_LOCK_UNLOCK; - break; - default: - dout("Have unknown lock type %d", lock->fl_type); - err = -EINVAL; - } - - return err; -} diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 100596c49353..0b2c801f4bbb 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -1012,7 +1012,6 @@ extern int ceph_encode_locks_to_buffer(struct inode *inode, extern int ceph_locks_to_pagelist(struct ceph_filelock *flocks, struct ceph_pagelist *pagelist, int num_fcntl_locks, int num_flock_locks); -extern int lock_to_ceph_filelock(struct file_lock *fl, struct ceph_filelock *c); /* debugfs.c */ extern int 
ceph_fs_debugfs_init(struct ceph_fs_client *client); From 4deb14a2593dfade102dd94a803a63cf620cfd56 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 11 Sep 2017 10:36:28 +0800 Subject: [PATCH 03/18] ceph: optimize flock encoding during reconnect Don't malloc if there is no flock. Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/locks.c | 17 ++++++++++------- fs/ceph/mds_client.c | 34 ++++++++++++++++++++-------------- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 2927f3bc2fc9..aaea82076849 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -431,19 +431,22 @@ int ceph_locks_to_pagelist(struct ceph_filelock *flocks, if (err) goto out_fail; - err = ceph_pagelist_append(pagelist, flocks, - num_fcntl_locks * sizeof(*flocks)); - if (err) - goto out_fail; + if (num_fcntl_locks > 0) { + err = ceph_pagelist_append(pagelist, flocks, + num_fcntl_locks * sizeof(*flocks)); + if (err) + goto out_fail; + } nlocks = cpu_to_le32(num_flock_locks); err = ceph_pagelist_append(pagelist, &nlocks, sizeof(nlocks)); if (err) goto out_fail; - err = ceph_pagelist_append(pagelist, - &flocks[num_fcntl_locks], - num_flock_locks * sizeof(*flocks)); + if (num_flock_locks > 0) { + err = ceph_pagelist_append(pagelist, &flocks[num_fcntl_locks], + num_flock_locks * sizeof(*flocks)); + } out_fail: return err; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index c8a811db387a..295cf5e42ea9 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2899,26 +2899,32 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, if (recon_state->msg_version >= 2) { int num_fcntl_locks, num_flock_locks; - struct ceph_filelock *flocks; + struct ceph_filelock *flocks = NULL; size_t struct_len, total_len = 0; u8 struct_v = 0; encode_again: ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); - flocks = kmalloc((num_fcntl_locks+num_flock_locks) * - sizeof(struct 
ceph_filelock), GFP_NOFS); - if (!flocks) { - err = -ENOMEM; - goto out_free; - } - err = ceph_encode_locks_to_buffer(inode, flocks, - num_fcntl_locks, - num_flock_locks); - if (err) { + if (num_fcntl_locks + num_flock_locks > 0) { + flocks = kmalloc((num_fcntl_locks + num_flock_locks) * + sizeof(struct ceph_filelock), GFP_NOFS); + if (!flocks) { + err = -ENOMEM; + goto out_free; + } + err = ceph_encode_locks_to_buffer(inode, flocks, + num_fcntl_locks, + num_flock_locks); + if (err) { + kfree(flocks); + flocks = NULL; + if (err == -ENOSPC) + goto encode_again; + goto out_free; + } + } else { kfree(flocks); - if (err == -ENOSPC) - goto encode_again; - goto out_free; + flocks = NULL; } if (recon_state->msg_version >= 3) { From b3f8d68f38a879daed1eab66c0e19bc293096d34 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Mon, 11 Sep 2017 10:58:55 +0800 Subject: [PATCH 04/18] ceph: handle 'session get evicted while there are file locks' When a session gets evicted, all file locks associated with the session get released remotely by the mds. File locks tracked by the kernel become stale. In this situation, set an error flag on the inode. The flag makes further file lock syscalls return -EIO. Another option to handle this situation is to clean up the file locks tracked by the kernel. I did not choose it because it is inconvenient to notify the user program about the error. 
Signed-off-by: "Yan, Zheng" Acked-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/locks.c | 52 ++++++++++++++++++++++++++++++++++---------- fs/ceph/mds_client.c | 21 +++++++++++++----- fs/ceph/super.h | 2 ++ 3 files changed, 58 insertions(+), 17 deletions(-) diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index aaea82076849..9e66f69ee8a5 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -39,7 +39,13 @@ static void ceph_fl_copy_lock(struct file_lock *dst, struct file_lock *src) static void ceph_fl_release_lock(struct file_lock *fl) { struct inode *inode = file_inode(fl->fl_file); - atomic_dec(&ceph_inode(inode)->i_filelock_ref); + struct ceph_inode_info *ci = ceph_inode(inode); + if (atomic_dec_and_test(&ci->i_filelock_ref)) { + /* clear error when all locks are released */ + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags &= ~CEPH_I_ERROR_FILELOCK; + spin_unlock(&ci->i_ceph_lock); + } } static const struct file_lock_operations ceph_fl_lock_ops = { @@ -208,10 +214,11 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, int ceph_lock(struct file *file, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(file); - int err; + struct ceph_inode_info *ci = ceph_inode(inode); + int err = 0; u16 op = CEPH_MDS_OP_SETFILELOCK; - u8 lock_cmd; u8 wait = 0; + u8 lock_cmd; if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; @@ -227,7 +234,10 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) else if (IS_SETLKW(cmd)) wait = 1; - if (op == CEPH_MDS_OP_SETFILELOCK) { + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { + err = -EIO; + } else if (op == CEPH_MDS_OP_SETFILELOCK) { /* * increasing i_filelock_ref closes race window between * handling request reply and adding file_lock struct to @@ -235,7 +245,13 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) * window. Caller function will decrease the counter. 
*/ fl->fl_ops = &ceph_fl_lock_ops; - atomic_inc(&ceph_inode(inode)->i_filelock_ref); + atomic_inc(&ci->i_filelock_ref); + } + spin_unlock(&ci->i_ceph_lock); + if (err < 0) { + if (op == CEPH_MDS_OP_SETFILELOCK && F_UNLCK == fl->fl_type) + posix_lock_file(file, fl, NULL); + return err; } if (F_RDLCK == fl->fl_type) @@ -247,10 +263,10 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) err = ceph_lock_message(CEPH_LOCK_FCNTL, op, inode, lock_cmd, wait, fl); if (!err) { - if (op != CEPH_MDS_OP_GETFILELOCK) { + if (op == CEPH_MDS_OP_SETFILELOCK) { dout("mds locked, locking locally"); err = posix_lock_file(file, fl, NULL); - if (err && (CEPH_MDS_OP_SETFILELOCK == op)) { + if (err) { /* undo! This should only happen if * the kernel detects local * deadlock. */ @@ -267,9 +283,10 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) int ceph_flock(struct file *file, int cmd, struct file_lock *fl) { struct inode *inode = file_inode(file); - int err; - u8 lock_cmd; + struct ceph_inode_info *ci = ceph_inode(inode); + int err = 0; u8 wait = 0; + u8 lock_cmd; if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; @@ -279,9 +296,20 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) dout("ceph_flock, fl_file: %p", fl->fl_file); - /* see comment in ceph_lock */ - fl->fl_ops = &ceph_fl_lock_ops; - atomic_inc(&ceph_inode(inode)->i_filelock_ref); + spin_lock(&ci->i_ceph_lock); + if (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) { + err = -EIO; + } else { + /* see comment in ceph_lock */ + fl->fl_ops = &ceph_fl_lock_ops; + atomic_inc(&ci->i_filelock_ref); + } + spin_unlock(&ci->i_ceph_lock); + if (err < 0) { + if (F_UNLCK == fl->fl_type) + locks_lock_file_wait(file, fl); + return err; + } if (IS_SETLKW(cmd)) wait = 1; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 295cf5e42ea9..8c8361262ade 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1215,6 +1215,13 @@ static int remove_session_caps_cb(struct inode *inode, struct 
ceph_cap *cap, } spin_unlock(&mdsc->cap_dirty_lock); + if (atomic_read(&ci->i_filelock_ref) > 0) { + /* make further file lock syscall return -EIO */ + ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; + pr_warn_ratelimited(" dropping file locks for %p %lld\n", + inode, ceph_ino(inode)); + } + if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); ci->i_prealloc_cap_flush = NULL; @@ -2832,7 +2839,7 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_mds_cap_reconnect v2; struct ceph_mds_cap_reconnect_v1 v1; } rec; - struct ceph_inode_info *ci; + struct ceph_inode_info *ci = cap->ci; struct ceph_reconnect_state *recon_state = arg; struct ceph_pagelist *pagelist = recon_state->pagelist; char *path; @@ -2841,8 +2848,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, u64 snap_follows; struct dentry *dentry; - ci = cap->ci; - dout(" adding %p ino %llx.%llx cap %p %lld %s\n", inode, ceph_vinop(inode), cap, cap->cap_id, ceph_cap_string(cap->issued)); @@ -2875,7 +2880,8 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, rec.v2.issued = cpu_to_le32(cap->issued); rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v2.pathbase = cpu_to_le64(pathbase); - rec.v2.flock_len = 0; + rec.v2.flock_len = + (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 
0 : 1; } else { rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); @@ -2904,7 +2910,12 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, u8 struct_v = 0; encode_again: - ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); + if (rec.v2.flock_len) { + ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); + } else { + num_fcntl_locks = 0; + num_flock_locks = 0; + } if (num_fcntl_locks + num_flock_locks > 0) { flocks = kmalloc((num_fcntl_locks + num_flock_locks) * sizeof(struct ceph_filelock), GFP_NOFS); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0b2c801f4bbb..2beeec07fa76 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -488,6 +488,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */ #define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */ #define CEPH_I_ERROR_WRITE (1 << 11) /* have seen write errors */ +#define CEPH_I_ERROR_FILELOCK (1 << 12) /* have seen file lock errors */ + /* * We set the ERROR_WRITE bit when we start seeing write errors on an inode From 7271efa79f8bc01694d1a9fce597088a97b3b160 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Sat, 7 Oct 2017 16:02:21 +0200 Subject: [PATCH 05/18] ceph: fix bool initialization/comparison Bool initializations should use true and false. Bool tests don't need comparisons. Signed-off-by: Thomas Meyer Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index ff5d32cf9578..05ae1e472547 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1712,7 +1712,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, /* if we are unmounting, flush any unused caps immediately. 
*/ if (mdsc->stopping) - is_delayed = 1; + is_delayed = true; spin_lock(&ci->i_ceph_lock); @@ -3189,8 +3189,8 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, int dirty = le32_to_cpu(m->dirty); int cleaned = 0; bool drop = false; - bool wake_ci = 0; - bool wake_mdsc = 0; + bool wake_ci = false; + bool wake_mdsc = false; list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) { if (cf->tid == flush_tid) From 933ad2c9c8bbb1623c2d3c5753ad340152e15d9d Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 10 Oct 2017 17:06:25 +0800 Subject: [PATCH 06/18] ceph: disable cached readdir after dropping positive dentry Ideally CEPH_CAP_FILE_SHARED should have been revoked before a positive dentry gets dropped. But if something goes wrong, a later cached readdir may dereference the dropped dentry. Signed-off-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 6301bf299b7c..16d8b9dac649 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1186,6 +1186,7 @@ retry_lookup: ceph_snap(d_inode(dn)) != tvino.snap)) { dout(" dn %p points to wrong inode %p\n", dn, d_inode(dn)); + ceph_dir_clear_ordered(dir); d_delete(dn); dput(dn); goto retry_lookup; @@ -1323,6 +1324,7 @@ retry_lookup: dout(" %p links to %p %llx.%llx, not %llx.%llx\n", dn, d_inode(dn), ceph_vinop(d_inode(dn)), ceph_vinop(in)); + ceph_dir_clear_ordered(dir); d_invalidate(dn); have_lease = false; } @@ -1574,6 +1576,7 @@ retry_lookup: ceph_snap(d_inode(dn)) != tvino.snap)) { dout(" dn %p points to wrong inode %p\n", dn, d_inode(dn)); + __ceph_dir_clear_ordered(ci); d_delete(dn); dput(dn); goto retry_lookup; @@ -1598,7 +1601,9 @@ retry_lookup: &req->r_caps_reservation); if (ret < 0) { pr_err("fill_inode badness on %p\n", in); - if (d_really_is_negative(dn)) + if (d_really_is_positive(dn)) + __ceph_dir_clear_ordered(ci); + else iput(in); d_drop(dn); err = ret; From 
76bd6ec4988065d39983ba8e93bb738313f68050 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 16 Oct 2017 10:32:50 +0200 Subject: [PATCH 07/18] ceph: -EINVAL on decoding failure in ceph_mdsc_handle_fsmap() Don't set ->mdsmap_err to -ENOENT unconditionally, and drop unneeded return statement while at it. Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 8c8361262ade..b76506be4228 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3879,14 +3879,14 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) goto err_out; } return; + bad: pr_err("error decoding fsmap\n"); err_out: mutex_lock(&mdsc->mutex); - mdsc->mdsmap_err = -ENOENT; + mdsc->mdsmap_err = err; __wake_requests(mdsc, &mdsc->waiting_for_map); mutex_unlock(&mdsc->mutex); - return; } /* From 18370b36b28a6c1b059392e9b8f9a80332e51e7c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Sun, 15 Oct 2017 12:55:23 -0500 Subject: [PATCH 08/18] ceph: mark expected switch fall-throughs In preparation to enabling -Wimplicit-fallthrough, mark switch cases where we are expecting to fall through. Signed-off-by: Gustavo A. R. 
Silva [idryomov@gmail.com: amended "Older OSDs" comment] Signed-off-by: Ilya Dryomov --- net/ceph/ceph_hash.c | 12 +++++++++++- net/ceph/messenger.c | 1 + net/ceph/mon_client.c | 5 +++-- 3 files changed, 15 insertions(+), 3 deletions(-) diff --git a/net/ceph/ceph_hash.c b/net/ceph/ceph_hash.c index 67bb1f11e613..9a5850f264ed 100644 --- a/net/ceph/ceph_hash.c +++ b/net/ceph/ceph_hash.c @@ -47,28 +47,38 @@ unsigned int ceph_str_hash_rjenkins(const char *str, unsigned int length) /* handle the last 11 bytes */ c = c + length; - switch (len) { /* all the case statements fall through */ + switch (len) { case 11: c = c + ((__u32)k[10] << 24); + /* fall through */ case 10: c = c + ((__u32)k[9] << 16); + /* fall through */ case 9: c = c + ((__u32)k[8] << 8); /* the first byte of c is reserved for the length */ + /* fall through */ case 8: b = b + ((__u32)k[7] << 24); + /* fall through */ case 7: b = b + ((__u32)k[6] << 16); + /* fall through */ case 6: b = b + ((__u32)k[5] << 8); + /* fall through */ case 5: b = b + k[4]; + /* fall through */ case 4: a = a + ((__u32)k[3] << 24); + /* fall through */ case 3: a = a + ((__u32)k[2] << 16); + /* fall through */ case 2: a = a + ((__u32)k[1] << 8); + /* fall through */ case 1: a = a + k[0]; /* case 0: nothing left to add */ diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index ad93342c90d7..8a4d3758030b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -430,6 +430,7 @@ static void ceph_sock_state_change(struct sock *sk) switch (sk->sk_state) { case TCP_CLOSE: dout("%s TCP_CLOSE\n", __func__); + /* fall through */ case TCP_CLOSE_WAIT: dout("%s TCP_CLOSE_WAIT\n", __func__); con_sock_state_closing(con); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 9ae1bab8c05d..1547107f4854 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1279,9 +1279,10 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con, /* * Older OSDs don't set reply tid even if the orignal - * request 
had a non-zero tid. Workaround this weirdness - * by falling through to the allocate case. + * request had a non-zero tid. Work around this weirdness + * by allocating a new message. */ + /* fall through */ case CEPH_MSG_MON_MAP: case CEPH_MSG_MDS_MAP: case CEPH_MSG_OSD_MAP: From bb0581f01c38ff525295fc6128bc3a49202dabae Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 18 Oct 2017 12:34:25 +0100 Subject: [PATCH 09/18] ceph: remove unused and redundant variable dropping Variable dropping is set but never read and hence is redundant and can be removed. Cleans up clang warning: fs/ceph/caps.c:1170:2: warning: Value stored to 'dropping' is never read Signed-off-by: Colin Ian King Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 05ae1e472547..a14b2c974c9e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1160,7 +1160,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->vfs_inode; struct cap_msg_args arg; - int held, revoking, dropping; + int held, revoking; int wake = 0; int delayed = 0; int ret; @@ -1168,7 +1168,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, held = cap->issued | cap->implemented; revoking = cap->implemented & ~cap->issued; retain &= ~revoking; - dropping = cap->issued & ~retain; dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n", inode, cap, cap->session, From 1de797bb248d2276337139fecaffbd3bbc0f736d Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 12 Oct 2017 12:35:19 +0200 Subject: [PATCH 10/18] rbd: fix and simplify rbd_ioctl_set_ro() ->open_count/-EBUSY check is bogus and wrong: when an open device is set read-only, blkdev_write_iter() refuses further writes with -EPERM. This is standard behaviour and all other block devices allow this. 
set_disk_ro() call is also problematic: we affect the entire device when called on a single partition. All rbd_ioctl_set_ro() needs to do is refuse ro -> rw transition for mapped snapshots. Everything else can be handled by generic code. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 34 ++++++---------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index adc877dfef5c..fb7cb38a6d83 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -640,46 +640,24 @@ static void rbd_release(struct gendisk *disk, fmode_t mode) static int rbd_ioctl_set_ro(struct rbd_device *rbd_dev, unsigned long arg) { - int ret = 0; - int val; - bool ro; - bool ro_changed = false; + int ro; - /* get_user() may sleep, so call it before taking rbd_dev->lock */ - if (get_user(val, (int __user *)(arg))) + if (get_user(ro, (int __user *)arg)) return -EFAULT; - ro = val ? true : false; - /* Snapshot doesn't allow to write*/ + /* Snapshots can't be marked read-write */ if (rbd_dev->spec->snap_id != CEPH_NOSNAP && !ro) return -EROFS; - spin_lock_irq(&rbd_dev->lock); - /* prevent others open this device */ - if (rbd_dev->open_count > 1) { - ret = -EBUSY; - goto out; - } - - if (rbd_dev->mapping.read_only != ro) { - rbd_dev->mapping.read_only = ro; - ro_changed = true; - } - -out: - spin_unlock_irq(&rbd_dev->lock); - /* set_disk_ro() may sleep, so call it after releasing rbd_dev->lock */ - if (ret == 0 && ro_changed) - set_disk_ro(rbd_dev->disk, ro ? 
1 : 0); - - return ret; + /* Let blkdev_roset() handle it */ + return -ENOTTY; } static int rbd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct rbd_device *rbd_dev = bdev->bd_disk->private_data; - int ret = 0; + int ret; switch (cmd) { case BLKROSET: From 9568c93ecab92d3ee60f2f6bec4e4d91641c61a6 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 12 Oct 2017 12:35:19 +0200 Subject: [PATCH 11/18] rbd: get rid of rbd_mapping::read_only It is redundant -- rw/ro state is stored in hd_struct and managed by the block layer. Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 23 ++++------------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index fb7cb38a6d83..53b1ced21a13 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -348,7 +348,6 @@ struct rbd_client_id { struct rbd_mapping { u64 size; u64 features; - bool read_only; }; /* @@ -608,9 +607,6 @@ static int rbd_open(struct block_device *bdev, fmode_t mode) struct rbd_device *rbd_dev = bdev->bd_disk->private_data; bool removing = false; - if ((mode & FMODE_WRITE) && rbd_dev->mapping.read_only) - return -EROFS; - spin_lock_irq(&rbd_dev->lock); if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) removing = true; @@ -4028,15 +4024,8 @@ static void rbd_queue_workfn(struct work_struct *work) goto err_rq; } - /* Only reads are allowed to a read-only device */ - - if (op_type != OBJ_OP_READ) { - if (rbd_dev->mapping.read_only) { - result = -EROFS; - goto err_rq; - } - rbd_assert(rbd_dev->spec->snap_id == CEPH_NOSNAP); - } + rbd_assert(op_type == OBJ_OP_READ || + rbd_dev->spec->snap_id == CEPH_NOSNAP); /* * Quit early if the mapped snapshot no longer exists. 
It's @@ -5972,7 +5961,7 @@ static int rbd_dev_device_setup(struct rbd_device *rbd_dev) goto err_out_disk; set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE); - set_disk_ro(rbd_dev->disk, rbd_dev->mapping.read_only); + set_disk_ro(rbd_dev->disk, rbd_dev->opts->read_only); ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id); if (ret) @@ -6123,7 +6112,6 @@ static ssize_t do_rbd_add(struct bus_type *bus, struct rbd_options *rbd_opts = NULL; struct rbd_spec *spec = NULL; struct rbd_client *rbdc; - bool read_only; int rc; if (!try_module_get(THIS_MODULE)) @@ -6172,11 +6160,8 @@ static ssize_t do_rbd_add(struct bus_type *bus, } /* If we are mapping a snapshot it must be marked read-only */ - - read_only = rbd_dev->opts->read_only; if (rbd_dev->spec->snap_id != CEPH_NOSNAP) - read_only = true; - rbd_dev->mapping.read_only = read_only; + rbd_dev->opts->read_only = true; rc = rbd_dev_device_setup(rbd_dev); if (rc) From c8a96a31cb04c7664626ab6ada7f66c98c09efbd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 19 Oct 2017 08:53:58 -0400 Subject: [PATCH 12/18] ceph: clean up spinlocking and list handling around cleanup_cap_releases() Functions that release a lock taken in a parent frame are notoriously hard to follow. Split cleanup_cap_releases into two functions, one to detach the cap releases from the session (which should be called with the spinlock held), and another to dispose of those caps. 
Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 36 ++++++++++++++++++++++-------------- 1 file changed, 22 insertions(+), 14 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index b76506be4228..53cde84e698a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1039,22 +1039,23 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, * session caps */ -/* caller holds s_cap_lock, we drop it */ -static void cleanup_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) - __releases(session->s_cap_lock) +static void detach_cap_releases(struct ceph_mds_session *session, + struct list_head *target) { - LIST_HEAD(tmp_list); - list_splice_init(&session->s_cap_releases, &tmp_list); - session->s_num_cap_releases = 0; - spin_unlock(&session->s_cap_lock); + lockdep_assert_held(&session->s_cap_lock); - dout("cleanup_cap_releases mds%d\n", session->s_mds); - while (!list_empty(&tmp_list)) { + list_splice_init(&session->s_cap_releases, target); + session->s_num_cap_releases = 0; + dout("dispose_cap_releases mds%d\n", session->s_mds); +} + +static void dispose_cap_releases(struct ceph_mds_client *mdsc, + struct list_head *dispose) +{ + while (!list_empty(dispose)) { struct ceph_cap *cap; /* zero out the in-progress message */ - cap = list_first_entry(&tmp_list, - struct ceph_cap, session_caps); + cap = list_first_entry(dispose, struct ceph_cap, session_caps); list_del(&cap->session_caps); ceph_put_cap(mdsc, cap); } @@ -1251,6 +1252,8 @@ static void remove_session_caps(struct ceph_mds_session *session) { struct ceph_fs_client *fsc = session->s_mdsc->fsc; struct super_block *sb = fsc->sb; + LIST_HEAD(dispose); + dout("remove_session_caps on %p\n", session); iterate_session_caps(session, remove_session_caps_cb, fsc); @@ -1285,10 +1288,12 @@ static void remove_session_caps(struct ceph_mds_session *session) } // drop cap 
expires and unlock s_cap_lock - cleanup_cap_releases(session->s_mdsc, session); + detach_cap_releases(session, &dispose); BUG_ON(session->s_nr_caps > 0); BUG_ON(!list_empty(&session->s_cap_flushing)); + spin_unlock(&session->s_cap_lock); + dispose_cap_releases(session->s_mdsc, &dispose); } /* @@ -3015,6 +3020,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int s_nr_caps; struct ceph_pagelist *pagelist; struct ceph_reconnect_state recon_state; + LIST_HEAD(dispose); pr_info("mds%d reconnect start\n", mds); @@ -3048,7 +3054,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, */ session->s_cap_reconnect = 1; /* drop old cap expires; we're about to reestablish that state */ - cleanup_cap_releases(mdsc, session); + detach_cap_releases(session, &dispose); + spin_unlock(&session->s_cap_lock); + dispose_cap_releases(mdsc, &dispose); /* trim unused caps to reduce MDS's cache rejoin time */ if (mdsc->fsc->sb->s_root) From 080a330e1d9142b9d958a40dcef3ae5ae5d8820a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 23 Oct 2017 10:58:40 -0400 Subject: [PATCH 13/18] ceph: present consistent fsid, regardless of arch endianness Since its inception, ceph has presented the fsid as an opaque value without any sort of endianness conversion. This means that the value presented is different on architectures of different endianness. While the value that should be stuffed into f_fsid is poorly-defined, I think it would be best to strive for consistency here between architectures, and clients (we need to present this properly to the userland client as well). Change ceph_statfs to convert the opaque words to host-endian before doing the xor. On an upgrade, a big-endian box may see a different fsid than it did before, but little-endian arches should see no change with this patch. 
Signed-off-by: Jeff Layton Reviewed-by: Sage Weil Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e4082afedcb1..fe9fbb3f13f7 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -84,8 +84,9 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_ffree = -1; buf->f_namelen = NAME_MAX; - /* leave fsid little-endian, regardless of host endianness */ - fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); + /* Must convert the fsid, for consistent values across arches */ + fsid = le64_to_cpu(*(__le64 *)(&monmap->fsid)) ^ + le64_to_cpu(*((__le64 *)&monmap->fsid + 1)); buf->f_fsid.val[0] = fsid & 0xffffffff; buf->f_fsid.val[1] = fsid >> 32; From 81302565178cfa2d419f5e9743add02997c6b2dc Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 30 Oct 2017 10:58:30 -0400 Subject: [PATCH 14/18] ceph: remove the bump of i_version Eventually, we'll want to wire cephfs up to use the change attribute that the cluster tracks instead, but for now this is unneeded. 
Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 16d8b9dac649..ab81652198c4 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -787,7 +787,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page, /* update inode */ ci->i_version = le64_to_cpu(info->version); - inode->i_version++; inode->i_rdev = le32_to_cpu(info->rdev); inode->i_blkbits = fls(le32_to_cpu(info->layout.fl_stripe_unit)) - 1; From ec1dff25b0a012711e10290039fbc8e1c5dd69fb Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 31 Oct 2017 15:51:14 -0400 Subject: [PATCH 15/18] ceph: silence sparse endianness warning in encode_caps_cb sparse warns: fs/ceph/mds_client.c:2887:34: warning: incorrect type in assignment (different base types) fs/ceph/mds_client.c:2887:34: expected restricted __le32 [assigned] [usertype] flock_len fs/ceph/mds_client.c:2887:34: got int At this point, it's just being used as a flag. It gets overwritten later if the rest of the encoding succeeds. Signed-off-by: Jeff Layton Reviewed-by: "Yan, Zheng" Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 53cde84e698a..ab69dcb70e8a 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -2885,8 +2885,8 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap, rec.v2.issued = cpu_to_le32(cap->issued); rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); rec.v2.pathbase = cpu_to_le64(pathbase); - rec.v2.flock_len = - (ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1; + rec.v2.flock_len = (__force __le32) + ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 
0 : 1); } else { rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); From 7c084289795bc0f3b9ab315ac3c8d269dd4d0215 Mon Sep 17 00:00:00 2001 From: David Disseldorp Date: Thu, 2 Nov 2017 01:05:11 +0100 Subject: [PATCH 16/18] rbd: set discard_alignment to zero RBD devices are currently incorrectly initialised with the block queue discard_alignment set to the underlying RADOS object size. As per Documentation/ABI/testing/sysfs-block: The discard_alignment parameter indicates how many bytes the beginning of the device is offset from the internal allocation unit's natural alignment. Correcting the discard_alignment parameter from the RADOS object size to zero (the blk_set_default_limits() default) has no effect on how discard requests are propagated through the block layer - @alignment in __blkdev_issue_discard() remains zero. However, it does fix the UNMAP granularity alignment value advertised to SCSI initiators via the Block Limits VPD. Signed-off-by: David Disseldorp Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 53b1ced21a13..8c132a7fbd2c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4390,7 +4390,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) /* enable the discard support */ queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); q->limits.discard_granularity = segment_size; - q->limits.discard_alignment = segment_size; blk_queue_max_discard_sectors(q, segment_size / SECTOR_SIZE); blk_queue_max_write_zeroes_sectors(q, segment_size / SECTOR_SIZE); From b11270853fa3654f08d4a6a03b23ddb220512d8d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 6 Nov 2017 21:57:26 -0800 Subject: [PATCH 17/18] libceph: don't WARN() if user tries to add invalid key The WARN_ON(!key->len) in set_secret() in net/ceph/crypto.c is hit if a user tries to add a key of type "ceph" with an invalid 
payload as follows (assuming CONFIG_CEPH_LIB=y): echo -e -n '\x01\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00' \ | keyctl padd ceph desc @s This can be hit by fuzzers. As this is merely bad input and not a kernel bug, replace the WARN_ON() with return -EINVAL. Fixes: 7af3ea189a9a ("libceph: stop allocating a new cipher on every crypto request") Cc: stable@vger.kernel.org # v4.10+ Signed-off-by: Eric Biggers Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- net/ceph/crypto.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 489610ac1cdd..bf9d079cbafd 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -37,7 +37,9 @@ static int set_secret(struct ceph_crypto_key *key, void *buf) return -ENOTSUPP; } - WARN_ON(!key->len); + if (!key->len) + return -EINVAL; + key->key = kmemdup(buf, key->len, GFP_NOIO); if (!key->key) { ret = -ENOMEM; From 3cfa3b16dd2f1787f9d19d6da2fe9652d806b387 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 13 Nov 2017 10:35:40 +0100 Subject: [PATCH 18/18] rbd: default to single-major device number scheme It's been 3.5 years, let's turn it on by default. Support in rbd(8) utility goes back to pre-firefly, "rbd map" has been loading the module with single_major=Y ever since. However, if the module is already loaded (whether by hand or at boot time), we end up with single_major=N. Also, some people don't install rbd(8) and use the sysfs interface directly. (With single_major=N, a major number is consumed for every mapping, imposing a limit of ~240 rbd images per host. single_major=Y allows mapping thousands of rbd images on a single machine.)
Signed-off-by: Ilya Dryomov Reviewed-by: Jason Dillaman --- drivers/block/rbd.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8c132a7fbd2c..38fc5f397fde 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -449,12 +449,11 @@ static DEFINE_IDA(rbd_dev_id_ida); static struct workqueue_struct *rbd_wq; /* - * Default to false for now, as single-major requires >= 0.75 version of - * userspace rbd utility. + * single-major requires >= 0.75 version of userspace rbd utility. */ -static bool single_major = false; +static bool single_major = true; module_param(single_major, bool, S_IRUGO); -MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: false)"); +MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)"); static int rbd_img_request_submit(struct rbd_img_request *img_request);