From a33f6432b3a63a4909dbbb0967f7c9df8ff2de91 Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Tue, 11 Aug 2020 15:23:03 +0800 Subject: [PATCH 01/27] ceph: encode inodes' parent/d_name in cap reconnect message Since nautilus, MDS tracks dirfrags whose child inodes have caps in open file table. When MDS recovers, it prefetches all of these dirfrags. This avoids using backtrace to load inodes. But dirfrags prefetch may load lots of useless inodes into cache, and make MDS run out of memory. Recent MDS adds an option that disables dirfrags prefetch. When dirfrags prefetch is disabled. Recovering MDS only prefetches corresponding dir inodes. Including inodes' parent/d_name in cap reconnect message can help MDS to load inodes into its cache. Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 89 ++++++++++++++++++++++++++++++-------------- 1 file changed, 61 insertions(+), 28 deletions(-) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 4a26862d7667..76d8d9495d1d 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3612,6 +3612,39 @@ fail_msg: return err; } +static struct dentry* d_find_primary(struct inode *inode) +{ + struct dentry *alias, *dn = NULL; + + if (hlist_empty(&inode->i_dentry)) + return NULL; + + spin_lock(&inode->i_lock); + if (hlist_empty(&inode->i_dentry)) + goto out_unlock; + + if (S_ISDIR(inode->i_mode)) { + alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); + if (!IS_ROOT(alias)) + dn = dget(alias); + goto out_unlock; + } + + hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { + spin_lock(&alias->d_lock); + if (!d_unhashed(alias) && + (ceph_dentry(alias)->flags & CEPH_DENTRY_PRIMARY_LINK)) { + dn = dget_dlock(alias); + } + spin_unlock(&alias->d_lock); + if (dn) + break; + } +out_unlock: + spin_unlock(&inode->i_lock); + return dn; +} + /* * Encode information about a cap for a reconnect with the MDS. */ @@ -3625,13 +3658,32 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, struct ceph_inode_info *ci = cap->ci; struct ceph_reconnect_state *recon_state = arg; struct ceph_pagelist *pagelist = recon_state->pagelist; - int err; + struct dentry *dentry; + char *path; + int pathlen, err; + u64 pathbase; u64 snap_follows; dout(" adding %p ino %llx.%llx cap %p %lld %s\n", inode, ceph_vinop(inode), cap, cap->cap_id, ceph_cap_string(cap->issued)); + dentry = d_find_primary(inode); + if (dentry) { + /* set pathbase to parent dir when msg_version >= 2 */ + path = ceph_mdsc_build_path(dentry, &pathlen, &pathbase, + recon_state->msg_version >= 2); + dput(dentry); + if (IS_ERR(path)) { + err = PTR_ERR(path); + goto out_err; + } + } else { + path = NULL; + pathlen = 0; + pathbase = 0; + } + spin_lock(&ci->i_ceph_lock); cap->seq = 0; /* reset cap seq */ cap->issue_seq = 0; /* and issue_seq */ @@ -3652,7 +3704,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v2.issued = cpu_to_le32(cap->issued); rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); - rec.v2.pathbase = 0; + rec.v2.pathbase = cpu_to_le64(pathbase); rec.v2.flock_len = (__force __le32) ((ci->i_ceph_flags & CEPH_I_ERROR_FILELOCK) ? 0 : 1); } else { @@ -3663,7 +3715,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime); ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); - rec.v1.pathbase = 0; + rec.v1.pathbase = cpu_to_le64(pathbase); } if (list_empty(&ci->i_cap_snaps)) { @@ -3725,7 +3777,7 @@ encode_again: sizeof(struct ceph_filelock); rec.v2.flock_len = cpu_to_le32(struct_len); - struct_len += sizeof(u32) + sizeof(rec.v2); + struct_len += sizeof(u32) + pathlen + sizeof(rec.v2); if (struct_v >= 2) struct_len += sizeof(u64); /* snap_follows */ @@ -3749,7 +3801,7 @@ encode_again: ceph_pagelist_encode_8(pagelist, 1); ceph_pagelist_encode_32(pagelist, struct_len); } - ceph_pagelist_encode_string(pagelist, NULL, 0); + ceph_pagelist_encode_string(pagelist, path, pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2)); ceph_locks_to_pagelist(flocks, pagelist, num_fcntl_locks, num_flock_locks); @@ -3758,39 +3810,20 @@ encode_again: out_freeflocks: kfree(flocks); } else { - u64 pathbase = 0; - int pathlen = 0; - char *path = NULL; - struct dentry *dentry; - - dentry = d_find_alias(inode); - if (dentry) { - path = ceph_mdsc_build_path(dentry, - &pathlen, &pathbase, 0); - dput(dentry); - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out_err; - } - rec.v1.pathbase = cpu_to_le64(pathbase); - } - err = ceph_pagelist_reserve(pagelist, sizeof(u64) + sizeof(u32) + pathlen + sizeof(rec.v1)); - if (err) { - goto out_freepath; - } + if (err) + goto out_err; ceph_pagelist_encode_64(pagelist, ceph_ino(inode)); ceph_pagelist_encode_string(pagelist, path, pathlen); ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1)); -out_freepath: - ceph_mdsc_free_path(path, pathlen); } out_err: - if (err >= 0) + ceph_mdsc_free_path(path, pathlen); + if (!err) recon_state->nr_caps++; return err; } From 1c30c90733879ea197dd29af54450a0f6cdcacb1 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Fri, 14 Aug 2020 10:38:22 +0100 Subject: [PATCH 02/27] ceph: remove unnecessary return in switch statement Since there's a return immediately after the 'break', there's no need for this extra 'return' in the S_IFDIR case. Signed-off-by: Luis Henriques Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3f4c993dfc6f..fb3ea715a19d 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -256,8 +256,6 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) case S_IFDIR: ret = ceph_init_file_info(inode, file, fmode, S_ISDIR(inode->i_mode)); - if (ret) - return ret; break; case S_IFLNK: From 3986f9a42e993075af01c17dc8968cfb96a4fe53 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 17 Aug 2020 13:45:04 +0200 Subject: [PATCH 03/27] libceph: multiple workspaces for CRUSH computations Replace a global map->crush_workspace (protected by a global mutex) with a list of workspaces, up to the number of CPUs + 1. This is based on a patch from Robin Geuze . Robin and his team have observed a 10-20% increase in IOPS on all queue depths and lower CPU usage as well on a high-end all-NVMe 100GbE cluster. Signed-off-by: Ilya Dryomov --- include/linux/ceph/osdmap.h | 14 ++- include/linux/crush/crush.h | 3 + net/ceph/osdmap.c | 166 ++++++++++++++++++++++++++++++++---- 3 files changed, 166 insertions(+), 17 deletions(-) diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h index 3f4498fef6ad..cad9acfbc320 100644 --- a/include/linux/ceph/osdmap.h +++ b/include/linux/ceph/osdmap.h @@ -137,6 +137,17 @@ int ceph_oid_aprintf(struct ceph_object_id *oid, gfp_t gfp, const char *fmt, ...); void ceph_oid_destroy(struct ceph_object_id *oid); +struct workspace_manager { + struct list_head idle_ws; + spinlock_t ws_lock; + /* Number of free workspaces */ + int free_ws; + /* Total number of allocated workspaces */ + atomic_t total_ws; + /* Waiters for a free workspace */ + wait_queue_head_t ws_wait; +}; + struct ceph_pg_mapping { struct rb_node node; struct ceph_pg pgid; @@ -184,8 +195,7 @@ struct ceph_osdmap { * the list of osds that store+replicate them. */ struct crush_map *crush; - struct mutex crush_workspace_mutex; - void *crush_workspace; + struct workspace_manager crush_wsm; }; static inline bool ceph_osd_exists(struct ceph_osdmap *map, int osd) diff --git a/include/linux/crush/crush.h b/include/linux/crush/crush.h index 2f811baf78d2..30dba392b730 100644 --- a/include/linux/crush/crush.h +++ b/include/linux/crush/crush.h @@ -346,6 +346,9 @@ struct crush_work_bucket { struct crush_work { struct crush_work_bucket **work; /* Per-bucket working store */ +#ifdef __KERNEL__ + struct list_head item; +#endif }; #ifdef __KERNEL__ diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 96c25f5e064a..fa08c15be0c0 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -964,6 +964,143 @@ bad: return -EINVAL; } +/* + * CRUSH workspaces + * + * workspace_manager framework borrowed from fs/btrfs/compression.c. + * Two simplifications: there is only one type of workspace and there + * is always at least one workspace. + */ +static struct crush_work *alloc_workspace(const struct crush_map *c) +{ + struct crush_work *work; + size_t work_size; + + WARN_ON(!c->working_size); + work_size = crush_work_size(c, CEPH_PG_MAX_SIZE); + dout("%s work_size %zu bytes\n", __func__, work_size); + + work = ceph_kvmalloc(work_size, GFP_NOIO); + if (!work) + return NULL; + + INIT_LIST_HEAD(&work->item); + crush_init_workspace(c, work); + return work; +} + +static void free_workspace(struct crush_work *work) +{ + WARN_ON(!list_empty(&work->item)); + kvfree(work); +} + +static void init_workspace_manager(struct workspace_manager *wsm) +{ + INIT_LIST_HEAD(&wsm->idle_ws); + spin_lock_init(&wsm->ws_lock); + atomic_set(&wsm->total_ws, 0); + wsm->free_ws = 0; + init_waitqueue_head(&wsm->ws_wait); +} + +static void add_initial_workspace(struct workspace_manager *wsm, + struct crush_work *work) +{ + WARN_ON(!list_empty(&wsm->idle_ws)); + + list_add(&work->item, &wsm->idle_ws); + atomic_set(&wsm->total_ws, 1); + wsm->free_ws = 1; +} + +static void cleanup_workspace_manager(struct workspace_manager *wsm) +{ + struct crush_work *work; + + while (!list_empty(&wsm->idle_ws)) { + work = list_first_entry(&wsm->idle_ws, struct crush_work, + item); + list_del_init(&work->item); + free_workspace(work); + } + atomic_set(&wsm->total_ws, 0); + wsm->free_ws = 0; +} + +/* + * Finds an available workspace or allocates a new one. If it's not + * possible to allocate a new one, waits until there is one. + */ +static struct crush_work *get_workspace(struct workspace_manager *wsm, + const struct crush_map *c) +{ + struct crush_work *work; + int cpus = num_online_cpus(); + +again: + spin_lock(&wsm->ws_lock); + if (!list_empty(&wsm->idle_ws)) { + work = list_first_entry(&wsm->idle_ws, struct crush_work, + item); + list_del_init(&work->item); + wsm->free_ws--; + spin_unlock(&wsm->ws_lock); + return work; + + } + if (atomic_read(&wsm->total_ws) > cpus) { + DEFINE_WAIT(wait); + + spin_unlock(&wsm->ws_lock); + prepare_to_wait(&wsm->ws_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(&wsm->total_ws) > cpus && !wsm->free_ws) + schedule(); + finish_wait(&wsm->ws_wait, &wait); + goto again; + } + atomic_inc(&wsm->total_ws); + spin_unlock(&wsm->ws_lock); + + work = alloc_workspace(c); + if (!work) { + atomic_dec(&wsm->total_ws); + wake_up(&wsm->ws_wait); + + /* + * Do not return the error but go back to waiting. We + * have the inital workspace and the CRUSH computation + * time is bounded so we will get it eventually. + */ + WARN_ON(atomic_read(&wsm->total_ws) < 1); + goto again; + } + return work; +} + +/* + * Puts a workspace back on the list or frees it if we have enough + * idle ones sitting around. + */ +static void put_workspace(struct workspace_manager *wsm, + struct crush_work *work) +{ + spin_lock(&wsm->ws_lock); + if (wsm->free_ws <= num_online_cpus()) { + list_add(&work->item, &wsm->idle_ws); + wsm->free_ws++; + spin_unlock(&wsm->ws_lock); + goto wake; + } + spin_unlock(&wsm->ws_lock); + + free_workspace(work); + atomic_dec(&wsm->total_ws); +wake: + if (wq_has_sleeper(&wsm->ws_wait)) + wake_up(&wsm->ws_wait); +} + /* * osd map */ @@ -981,7 +1118,8 @@ struct ceph_osdmap *ceph_osdmap_alloc(void) map->primary_temp = RB_ROOT; map->pg_upmap = RB_ROOT; map->pg_upmap_items = RB_ROOT; - mutex_init(&map->crush_workspace_mutex); + + init_workspace_manager(&map->crush_wsm); return map; } @@ -989,8 +1127,11 @@ struct ceph_osdmap *ceph_osdmap_alloc(void) void ceph_osdmap_destroy(struct ceph_osdmap *map) { dout("osdmap_destroy %p\n", map); + if (map->crush) crush_destroy(map->crush); + cleanup_workspace_manager(&map->crush_wsm); + while (!RB_EMPTY_ROOT(&map->pg_temp)) { struct ceph_pg_mapping *pg = rb_entry(rb_first(&map->pg_temp), @@ -1029,7 +1170,6 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) kvfree(map->osd_weight); kvfree(map->osd_addr); kvfree(map->osd_primary_affinity); - kvfree(map->crush_workspace); kfree(map); } @@ -1104,26 +1244,22 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) static int osdmap_set_crush(struct ceph_osdmap *map, struct crush_map *crush) { - void *workspace; - size_t work_size; + struct crush_work *work; if (IS_ERR(crush)) return PTR_ERR(crush); - work_size = crush_work_size(crush, CEPH_PG_MAX_SIZE); - dout("%s work_size %zu bytes\n", __func__, work_size); - workspace = ceph_kvmalloc(work_size, GFP_NOIO); - if (!workspace) { + work = alloc_workspace(crush); + if (!work) { crush_destroy(crush); return -ENOMEM; } - crush_init_workspace(crush, workspace); if (map->crush) crush_destroy(map->crush); - kvfree(map->crush_workspace); + cleanup_workspace_manager(&map->crush_wsm); map->crush = crush; - map->crush_workspace = workspace; + add_initial_workspace(&map->crush_wsm, work); return 0; } @@ -2322,6 +2458,7 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, s64 choose_args_index) { struct crush_choose_arg_map *arg_map; + struct crush_work *work; int r; BUG_ON(result_max > CEPH_PG_MAX_SIZE); @@ -2332,12 +2469,11 @@ static int do_crush(struct ceph_osdmap *map, int ruleno, int x, arg_map = lookup_choose_arg_map(&map->crush->choose_args, CEPH_DEFAULT_CHOOSE_ARGS); - mutex_lock(&map->crush_workspace_mutex); + work = get_workspace(&map->crush_wsm, map->crush); r = crush_do_rule(map->crush, ruleno, x, result, result_max, - weight, weight_max, map->crush_workspace, + weight, weight_max, work, arg_map ? arg_map->args : NULL); - mutex_unlock(&map->crush_workspace_mutex); - + put_workspace(&map->crush_wsm, work); return r; } From 3a8ebe0b8b616c5f6d72f9a95aa29ccd0b35408f Mon Sep 17 00:00:00 2001 From: Yanhu Cao Date: Mon, 24 Aug 2020 11:00:58 +0800 Subject: [PATCH 04/27] ceph: add column 'mds' to show caps in more user friendly In multi-mds, the 'caps' debugfs file will have duplicate ino, add the 'mds' column to indicate which mds session the cap belongs to. Signed-off-by: Yanhu Cao Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/debugfs.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 3e3fcda9b276..75b13175c530 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -202,7 +202,8 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p) { struct seq_file *s = p; - seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode), + seq_printf(s, "0x%-17llx%-3d%-17s%-17s\n", ceph_ino(inode), + cap->session->s_mds, ceph_cap_string(cap->issued), ceph_cap_string(cap->implemented)); return 0; @@ -222,8 +223,8 @@ static int caps_show(struct seq_file *s, void *p) "reserved\t%d\n" "min\t\t%d\n\n", total, avail, used, reserved, min); - seq_printf(s, "ino issued implemented\n"); - seq_printf(s, "-----------------------------------------------\n"); + seq_printf(s, "ino mds issued implemented\n"); + seq_printf(s, "--------------------------------------------------\n"); mutex_lock(&mdsc->mutex); for (i = 0; i < mdsc->max_sessions; i++) { From c5f575ed08c38d077a581a1ec0c48c23ee6b7c21 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 21 Aug 2020 21:20:59 -0700 Subject: [PATCH 05/27] ceph: drop special-casing for ITER_PIPE in ceph_sync_read This special casing was added in 7ce469a53e71 (ceph: fix splice read for no Fc capability case). The confirm callback for ITER_PIPE expects that the page is Uptodate and returns an error otherwise. A simpler workaround is just to use the Uptodate bit, which has no meaning for anonymous pages. Rip out the special casing for ITER_PIPE and just SetPageUptodate before we copy to the iter. Cc: John Hubbard Suggested-by: Al Viro Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 71 +++++++++++++++++--------------------------------- 1 file changed, 24 insertions(+), 47 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index fb3ea715a19d..ed8fbfe3bddc 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -863,6 +863,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, size_t page_off; u64 i_size; bool more; + int idx; + size_t left; req = ceph_osdc_new_request(osdc, &ci->i_layout, ci->i_vino, off, &len, 0, 1, @@ -876,29 +878,13 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, more = len < iov_iter_count(to); - if (unlikely(iov_iter_is_pipe(to))) { - ret = iov_iter_get_pages_alloc(to, &pages, len, - &page_off); - if (ret <= 0) { - ceph_osdc_put_request(req); - ret = -ENOMEM; - break; - } - num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE); - if (ret < len) { - len = ret; - osd_req_op_extent_update(req, 0, len); - more = false; - } - } else { - num_pages = calc_pages_for(off, len); - page_off = off & ~PAGE_MASK; - pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); - if (IS_ERR(pages)) { - ceph_osdc_put_request(req); - ret = PTR_ERR(pages); - break; - } + num_pages = calc_pages_for(off, len); + page_off = off & ~PAGE_MASK; + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) { + ceph_osdc_put_request(req); + ret = PTR_ERR(pages); + break; } osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_off, @@ -929,32 +915,23 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ret += zlen; } - if (unlikely(iov_iter_is_pipe(to))) { - if (ret > 0) { - iov_iter_advance(to, ret); - off += ret; - } else { - iov_iter_advance(to, 0); + idx = 0; + left = ret > 0 ? ret : 0; + while (left > 0) { + size_t len, copied; + page_off = off & ~PAGE_MASK; + len = min_t(size_t, left, PAGE_SIZE - page_off); + SetPageUptodate(pages[idx]); + copied = copy_page_to_iter(pages[idx++], + page_off, len, to); + off += copied; + left -= copied; + if (copied < len) { + ret = -EFAULT; + break; } - ceph_put_page_vector(pages, num_pages, false); - } else { - int idx = 0; - size_t left = ret > 0 ? ret : 0; - while (left > 0) { - size_t len, copied; - page_off = off & ~PAGE_MASK; - len = min_t(size_t, left, PAGE_SIZE - page_off); - copied = copy_page_to_iter(pages[idx++], - page_off, len, to); - off += copied; - left -= copied; - if (copied < len) { - ret = -EFAULT; - break; - } - } - ceph_release_page_vector(pages, num_pages); } + ceph_release_page_vector(pages, num_pages); if (ret < 0) { if (ret == -EBLACKLISTED) From 2678da88f4b449300d56e0e7a9e77d1a79c83463 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 3 Sep 2020 09:01:39 -0400 Subject: [PATCH 06/27] ceph: add ceph_sb_to_mdsc helper support to parse the mdsc This will help simplify the code. [ jlayton: fix minor merge conflict in quota.c ] Signed-off-by: Xiubo Li Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 3 +-- fs/ceph/dir.c | 20 +++++++------------- fs/ceph/file.c | 8 +++----- fs/ceph/inode.c | 5 ++--- fs/ceph/locks.c | 2 +- fs/ceph/quota.c | 10 +++++----- fs/ceph/snap.c | 2 +- fs/ceph/super.h | 6 ++++++ 8 files changed, 26 insertions(+), 30 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 034b3f4fdd3a..39753e0e0e5a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1906,9 +1906,8 @@ bool __ceph_should_report_size(struct ceph_inode_info *ci) void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_session *session) { - struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); - struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = &ci->vfs_inode; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_cap *cap; u64 flush_tid, oldest_flush_tid; int file_wanted, used, cap_used; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index d72e4a12bb69..a4d48370b2b3 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -38,8 +38,7 @@ static int __dir_lease_try_check(const struct dentry *dentry); static int ceph_d_init(struct dentry *dentry) { struct ceph_dentry_info *di; - struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dentry->d_sb); di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL); if (!di) @@ -738,7 +737,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; int op; int mask; @@ -827,8 +826,7 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry) static int ceph_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; struct ceph_acl_sec_ctx as_ctx = {}; int err; @@ -889,8 +887,7 @@ static int ceph_create(struct inode *dir, struct dentry *dentry, umode_t mode, static int ceph_symlink(struct inode *dir, struct dentry *dentry, const char *dest) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; struct ceph_acl_sec_ctx as_ctx = {}; int err; @@ -942,8 +939,7 @@ out: static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; struct ceph_acl_sec_ctx as_ctx = {}; int err = -EROFS; @@ -1010,8 +1006,7 @@ out: static int ceph_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { - struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; int err; @@ -1192,8 +1187,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - struct ceph_fs_client *fsc = ceph_sb_to_client(old_dir->i_sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old_dir->i_sb); struct ceph_mds_request *req; int op = CEPH_MDS_OP_RENAME; int err; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index ed8fbfe3bddc..762a280b7037 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -182,8 +182,7 @@ static void put_bvecs(struct bio_vec *bvecs, int num_bvecs, bool should_dirty) static struct ceph_mds_request * prepare_open_request(struct super_block *sb, int flags, int create_mode) { - struct ceph_fs_client *fsc = ceph_sb_to_client(sb); - struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb); struct ceph_mds_request *req; int want_auth = USE_ANY_MDS; int op = (flags & O_CREAT) ? CEPH_MDS_OP_CREATE : CEPH_MDS_OP_OPEN; @@ -283,7 +282,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) */ int ceph_renew_caps(struct inode *inode, int fmode) { - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req; int err, flags, wanted; @@ -1027,8 +1026,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) struct inode *inode = req->r_inode; struct ceph_aio_request *aio_req = req->r_priv; struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_client_metric *metric = &fsc->mdsc->metric; + struct ceph_client_metric *metric = &ceph_sb_to_mdsc(inode->i_sb)->metric; BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_BVECS); BUG_ON(!osd_data->num_bvecs); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index d163fa96cb40..1fed0e827eb7 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -558,8 +558,7 @@ void ceph_evict_inode(struct inode *inode) * caps in i_snap_caps. */ if (ci->i_snap_realm) { - struct ceph_mds_client *mdsc = - ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); if (ceph_snap(inode) == CEPH_NOSNAP) { struct ceph_snap_realm *realm = ci->i_snap_realm; dout(" dropping residual ref to snap realm %p\n", @@ -739,7 +738,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, struct ceph_mds_session *session, int cap_fmode, struct ceph_cap_reservation *caps_reservation) { - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_mds_reply_inode *info = iinfo->in; struct ceph_inode_info *ci = ceph_inode(inode); int issued, new_issued, info_caps; diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index d6b9166e71e4..048a435a29be 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -63,7 +63,7 @@ static const struct file_lock_operations ceph_fl_lock_ops = { static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode, int cmd, u8 wait, struct file_lock *fl) { - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_mds_request *req; int err; u64 length = 0; diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index cc2c4d40b022..83cb4f26b689 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -12,7 +12,7 @@ void ceph_adjust_quota_realms_count(struct inode *inode, bool inc) { - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); if (inc) atomic64_inc(&mdsc->quotarealms_count); else @@ -21,8 +21,8 @@ void ceph_adjust_quota_realms_count(struct inode *inode, bool inc) static inline bool ceph_has_realms_with_quotas(struct inode *inode) { - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; - struct super_block *sb = mdsc->fsc->sb; + struct super_block *sb = inode->i_sb; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(sb); struct inode *root = d_inode(sb->s_root); if (atomic64_read(&mdsc->quotarealms_count) > 0) @@ -266,7 +266,7 @@ restart: static bool ceph_quota_is_same_realm(struct inode *old, struct inode *new) { - struct ceph_mds_client *mdsc = ceph_inode_to_client(old)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(old->i_sb); struct ceph_snap_realm *old_realm, *new_realm; bool is_same; @@ -313,7 +313,7 @@ enum quota_check_op { static bool check_quota_exceeded(struct inode *inode, enum quota_check_op op, loff_t delta) { - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_inode_info *ci; struct ceph_snap_realm *realm, *next; struct inode *in; diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 923be9399b21..0da39c16dab4 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -602,7 +602,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap) { struct inode *inode = &ci->vfs_inode; - struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); BUG_ON(capsnap->writing); capsnap->size = inode->i_size; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index a3995ebe0623..483a52d281cd 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -451,6 +451,12 @@ ceph_sb_to_client(const struct super_block *sb) return (struct ceph_fs_client *)sb->s_fs_info; } +static inline struct ceph_mds_client * +ceph_sb_to_mdsc(const struct super_block *sb) +{ + return (struct ceph_mds_client *)ceph_sb_to_client(sb)->mdsc; +} + static inline struct ceph_vino ceph_vino(const struct inode *inode) { From 1dd8d470813699baab9112e95fce00979b21c4f7 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 3 Sep 2020 09:01:40 -0400 Subject: [PATCH 07/27] ceph: metrics for opened files, pinned caps and opened inodes In client for each inode, it may have many opened files and may have been pinned in more than one MDS servers. And some inodes are idle, which have no any opened files. This patch will show these metrics in the debugfs, likes: item total ----------------------------------------- opened files / total inodes 14 / 5 pinned i_caps / total inodes 7 / 5 opened inodes / total inodes 3 / 5 Will send these metrics to ceph, which will be used by the `fs top`, later. [ jlayton: drop unrelated hunk, count hashed inodes instead of allocated ones ] URL: https://tracker.ceph.com/issues/47005 Signed-off-by: Xiubo Li Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 38 ++++++++++++++++++++++++++++++++++++-- fs/ceph/debugfs.c | 11 +++++++++++ fs/ceph/inode.c | 7 ++++++- fs/ceph/metric.c | 14 ++++++++++++++ fs/ceph/metric.h | 7 +++++++ 5 files changed, 74 insertions(+), 3 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 39753e0e0e5a..c7e69547628e 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -4283,13 +4283,30 @@ void __ceph_touch_fmode(struct ceph_inode_info *ci, void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) { - int i; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); int bits = (fmode << 1) | 1; + bool is_opened = false; + int i; + + if (count == 1) + atomic64_inc(&mdsc->metric.opened_files); + spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { if (bits & (1 << i)) ci->i_nr_by_mode[i] += count; + + /* + * If any of the mode ref is larger than 1, + * that means it has been already opened by + * others. Just skip checking the PIN ref. + */ + if (i && ci->i_nr_by_mode[i] > 1) + is_opened = true; } + + if (!is_opened) + percpu_counter_inc(&mdsc->metric.opened_inodes); spin_unlock(&ci->i_ceph_lock); } @@ -4300,15 +4317,32 @@ void ceph_get_fmode(struct ceph_inode_info *ci, int fmode, int count) */ void ceph_put_fmode(struct ceph_inode_info *ci, int fmode, int count) { - int i; + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(ci->vfs_inode.i_sb); int bits = (fmode << 1) | 1; + bool is_closed = true; + int i; + + if (count == 1) + atomic64_dec(&mdsc->metric.opened_files); + spin_lock(&ci->i_ceph_lock); for (i = 0; i < CEPH_FILE_MODE_BITS; i++) { if (bits & (1 << i)) { BUG_ON(ci->i_nr_by_mode[i] < count); ci->i_nr_by_mode[i] -= count; } + + /* + * If any of the mode ref is not 0 after + * decreased, that means it is still opened + * by others. Just skip checking the PIN ref. + */ + if (i && ci->i_nr_by_mode[i]) + is_closed = false; } + + if (is_closed) + percpu_counter_dec(&mdsc->metric.opened_inodes); spin_unlock(&ci->i_ceph_lock); } diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 75b13175c530..7a8fbe3e4751 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -148,6 +148,17 @@ static int metric_show(struct seq_file *s, void *p) int nr_caps = 0; s64 total, sum, avg, min, max, sq; + sum = percpu_counter_sum(&m->total_inodes); + seq_printf(s, "item total\n"); + seq_printf(s, "------------------------------------------\n"); + seq_printf(s, "%-35s%lld / %lld\n", "opened files / total inodes", + atomic64_read(&m->opened_files), sum); + seq_printf(s, "%-35s%lld / %lld\n", "pinned i_caps / total inodes", + atomic64_read(&m->total_caps), sum); + seq_printf(s, "%-35s%lld / %lld\n", "opened inodes / total inodes", + percpu_counter_sum(&m->opened_inodes), sum); + + seq_printf(s, "\n"); seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); seq_printf(s, "-----------------------------------------------------------------------------------\n"); diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 1fed0e827eb7..526faf4778ce 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -42,10 +42,13 @@ static void ceph_inode_work(struct work_struct *work); static int ceph_set_ino_cb(struct inode *inode, void *data) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); ci->i_vino = *(struct ceph_vino *)data; inode->i_ino = ceph_vino_to_ino_t(ci->i_vino); inode_set_iversion_raw(inode, 0); + percpu_counter_inc(&mdsc->metric.total_inodes); + return 0; } @@ -538,11 +541,14 @@ void ceph_free_inode(struct inode *inode) void ceph_evict_inode(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_inode_frag *frag; struct rb_node *n; dout("evict_inode %p ino %llx.%llx\n", inode, ceph_vinop(inode)); + percpu_counter_dec(&mdsc->metric.total_inodes); + truncate_inode_pages_final(&inode->i_data); clear_inode(inode); @@ -558,7 +564,6 @@ void ceph_evict_inode(struct inode *inode) * caps in i_snap_caps. */ if (ci->i_snap_realm) { - struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); if (ceph_snap(inode) == CEPH_NOSNAP) { struct ceph_snap_realm *realm = ci->i_snap_realm; dout(" dropping residual ref to snap realm %p\n", diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 2466b261fba2..fee4c4778313 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -192,11 +192,23 @@ int ceph_metric_init(struct ceph_client_metric *m) m->total_metadatas = 0; m->metadata_latency_sum = 0; + atomic64_set(&m->opened_files, 0); + ret = percpu_counter_init(&m->opened_inodes, 0, GFP_KERNEL); + if (ret) + goto err_opened_inodes; + ret = percpu_counter_init(&m->total_inodes, 0, GFP_KERNEL); + if (ret) + goto err_total_inodes; + m->session = NULL; INIT_DELAYED_WORK(&m->delayed_work, metric_delayed_work); return 0; +err_total_inodes: + percpu_counter_destroy(&m->opened_inodes); +err_opened_inodes: + percpu_counter_destroy(&m->i_caps_mis); err_i_caps_mis: percpu_counter_destroy(&m->i_caps_hit); err_i_caps_hit: @@ -212,6 +224,8 @@ void ceph_metric_destroy(struct ceph_client_metric *m) if (!m) return; + percpu_counter_destroy(&m->total_inodes); + percpu_counter_destroy(&m->opened_inodes); percpu_counter_destroy(&m->i_caps_mis); percpu_counter_destroy(&m->i_caps_hit); percpu_counter_destroy(&m->d_lease_mis); diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index 1d0959d669d7..710f3f1dceab 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -115,6 +115,13 @@ struct ceph_client_metric { ktime_t metadata_latency_min; ktime_t metadata_latency_max; + /* The total number of directories and files that are opened */ + atomic64_t opened_files; + + /* The total number of inodes that have opened files or directories */ + struct percpu_counter opened_inodes; + struct percpu_counter total_inodes; + struct ceph_mds_session *session; struct delayed_work delayed_work; /* delayed work */ }; From 470a5c77eac0e07bfe60413fb3d314b734392bc3 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 11 Sep 2020 15:19:00 -0400 Subject: [PATCH 08/27] ceph: use kill_anon_super helper ceph open-codes this around some other activity and the rationale for it isn't clear. There is no need to delay free_anon_bdev until the end of kill_sb. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 7ec0e6d03d10..b3fc9bb61afc 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1205,14 +1205,13 @@ nomem: static void ceph_kill_sb(struct super_block *s) { struct ceph_fs_client *fsc = ceph_sb_to_client(s); - dev_t dev = s->s_dev; dout("kill_sb %p\n", s); ceph_mdsc_pre_umount(fsc->mdsc); flush_fs_workqueues(fsc); - generic_shutdown_super(s); + kill_anon_super(s); fsc->client->extra_mon_dispatch = NULL; ceph_fs_debugfs_cleanup(fsc); @@ -1220,7 +1219,6 @@ static void ceph_kill_sb(struct super_block *s) ceph_fscache_unregister_fs(fsc); destroy_fs_client(fsc); - free_anon_bdev(dev); } static struct file_system_type ceph_fs_type = { From 2e169296603470d209d9feecbfb67b9c4cb5ca0f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 14 Sep 2020 13:30:36 -0400 Subject: [PATCH 09/27] ceph: have ceph_writepages_start call pagevec_lookup_range_tag Currently it calls pagevec_lookup_range_nr_tag(), but that may be inefficient, as we might end up having to search several times as we get down to looking for fewer pages to fill the array. Thus spake Willy: "I think ceph is misusing pagevec_lookup_range_nr_tag(). Let's suppose you get a range which is AAAAbbbbAAAAbbbbAAAAbbbbbbbb(...)bbbbAAAA and you try to fetch max_pages=13. First loop will get AAAAbbbbAAAAb and have 8 locked_pages. The next call will get bbbAA and now locked_pages=10. Next call gets AAb ... and now you're iterating your way through all the 'b' one page at a time until you find that first A." 'A' here refers to pages that are eligible for writeback and 'b' represents ones that aren't (for whatever reason). Not capping the number of return pages may mean that we sometimes find more pages than are needed, but the extra references will just get put at the end. Ceph is also the only caller of pagevec_lookup_range_nr_tag(), so this change should allow us to eliminate that call as well. That will be done in a follow-on patch. Reported-by: Matthew Wilcox Signed-off-by: Jeff Layton Reviewed-by: Matthew Wilcox Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 6ea761c84494..b03dbaa9d345 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -962,9 +962,8 @@ retry: max_pages = wsize >> PAGE_SHIFT; get_more_pages: - pvec_pages = pagevec_lookup_range_nr_tag(&pvec, mapping, &index, - end, PAGECACHE_TAG_DIRTY, - max_pages - locked_pages); + pvec_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, + end, PAGECACHE_TAG_DIRTY); dout("pagevec_lookup_range_tag got %d\n", pvec_pages); if (!pvec_pages && !locked_pages) break; From 0b98acd6188309333c3a8a6e16feadadd31e4523 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 14 Sep 2020 13:39:19 +0200 Subject: [PATCH 10/27] libceph, rbd, ceph: "blacklist" -> "blocklist" Signed-off-by: Ilya Dryomov --- Documentation/filesystems/ceph.rst | 6 +++--- drivers/block/rbd.c | 8 ++++---- fs/ceph/addr.c | 24 ++++++++++++------------ fs/ceph/file.c | 4 ++-- fs/ceph/mds_client.c | 16 ++++++++-------- fs/ceph/super.c | 4 ++-- fs/ceph/super.h | 4 ++-- include/linux/ceph/mon_client.h | 2 +- include/linux/ceph/rados.h | 2 +- net/ceph/mon_client.c | 8 ++++---- 10 files changed, 39 insertions(+), 39 deletions(-) diff --git a/Documentation/filesystems/ceph.rst b/Documentation/filesystems/ceph.rst index 0aa70750df0f..7d2ef4e27273 100644 --- a/Documentation/filesystems/ceph.rst +++ b/Documentation/filesystems/ceph.rst @@ -163,14 +163,14 @@ Mount Options to the default VFS implementation if this option is used. recover_session= - Set auto reconnect mode in the case where the client is blacklisted. The + Set auto reconnect mode in the case where the client is blocklisted. The available modes are "no" and "clean". The default is "no". * no: never attempt to reconnect when client detects that it has been - blacklisted. Operations will generally fail after being blacklisted. + blocklisted. Operations will generally fail after being blocklisted. * clean: client reconnects to the ceph cluster automatically when it - detects that it has been blacklisted. During reconnect, client drops + detects that it has been blocklisted. During reconnect, client drops dirty data/metadata, invalidates page caches and writable file handles. After reconnect, file locks become stale because the MDS loses track of them. If an inode contains any stale file locks, read/write on the diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index e77eaab5cf23..bdd33bcf11b1 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4010,10 +4010,10 @@ static int rbd_try_lock(struct rbd_device *rbd_dev) rbd_warn(rbd_dev, "breaking header lock owned by %s%llu", ENTITY_NAME(lockers[0].id.name)); - ret = ceph_monc_blacklist_add(&client->monc, + ret = ceph_monc_blocklist_add(&client->monc, &lockers[0].info.addr); if (ret) { - rbd_warn(rbd_dev, "blacklist of %s%llu failed: %d", + rbd_warn(rbd_dev, "blocklist of %s%llu failed: %d", ENTITY_NAME(lockers[0].id.name), ret); goto out; } @@ -4077,7 +4077,7 @@ static int rbd_try_acquire_lock(struct rbd_device *rbd_dev) ret = rbd_try_lock(rbd_dev); if (ret < 0) { rbd_warn(rbd_dev, "failed to lock header: %d", ret); - if (ret == -EBLACKLISTED) + if (ret == -EBLOCKLISTED) goto out; ret = 1; /* request lock anyway */ @@ -4613,7 +4613,7 @@ static void rbd_reregister_watch(struct work_struct *work) ret = __rbd_register_watch(rbd_dev); if (ret) { rbd_warn(rbd_dev, "failed to reregister watch: %d", ret); - if (ret != -EBLACKLISTED && ret != -ENOENT) { + if (ret != -EBLOCKLISTED && ret != -ENOENT) { queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, RBD_RETRY_DELAY); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b03dbaa9d345..7b1f3dad576f 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -271,8 +271,8 @@ static int ceph_do_readpage(struct file *filp, struct page *page) if (err < 0) { SetPageError(page); ceph_fscache_readpage_cancel(inode, page); - if (err == -EBLACKLISTED) - fsc->blacklisted = true; + if (err == -EBLOCKLISTED) + fsc->blocklisted = true; goto out; } if (err < PAGE_SIZE) @@ -312,8 +312,8 @@ static void finish_read(struct ceph_osd_request *req) int i; dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); - if (rc == -EBLACKLISTED) - ceph_inode_to_client(inode)->blacklisted = true; + if (rc == -EBLOCKLISTED) + ceph_inode_to_client(inode)->blocklisted = true; /* unlock all pages, zeroing any data we didn't read */ osd_data = osd_req_op_extent_osd_data(req, 0); @@ -737,8 +737,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) end_page_writeback(page); return err; } - if (err == -EBLACKLISTED) - fsc->blacklisted = true; + if (err == -EBLOCKLISTED) + fsc->blocklisted = true; dout("writepage setting page/mapping error %d %p\n", err, page); mapping_set_error(&inode->i_data, err); @@ -801,8 +801,8 @@ static void writepages_finish(struct ceph_osd_request *req) if (rc < 0) { mapping_set_error(mapping, rc); ceph_set_error_write(ci); - if (rc == -EBLACKLISTED) - fsc->blacklisted = true; + if (rc == -EBLOCKLISTED) + fsc->blocklisted = true; } else { ceph_clear_error_write(ci); } @@ -2038,16 +2038,16 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, if (err >= 0 || err == -ENOENT) have |= POOL_READ; else if (err != -EPERM) { - if (err == -EBLACKLISTED) - fsc->blacklisted = true; + if (err == -EBLOCKLISTED) + fsc->blocklisted = true; goto out_unlock; } if (err2 == 0 || err2 == -EEXIST) have |= POOL_WRITE; else if (err2 != -EPERM) { - if (err2 == -EBLACKLISTED) - fsc->blacklisted = true; + if (err2 == -EBLOCKLISTED) + fsc->blocklisted = true; err = err2; goto out_unlock; } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 762a280b7037..209535d5b8d3 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -933,8 +933,8 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ceph_release_page_vector(pages, num_pages); if (ret < 0) { - if (ret == -EBLACKLISTED) - fsc->blacklisted = true; + if (ret == -EBLOCKLISTED) + fsc->blocklisted = true; break; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 76d8d9495d1d..bb2d938a17ac 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3303,7 +3303,7 @@ bad: } static int __decode_session_metadata(void **p, void *end, - bool *blacklisted) + bool *blocklisted) { /* map */ u32 n; @@ -3318,7 +3318,7 @@ static int __decode_session_metadata(void **p, void *end, ceph_decode_32_safe(p, end, len, bad); ceph_decode_need(p, end, len, bad); if (err_str && strnstr(*p, "blacklisted", len)) - *blacklisted = true; + *blocklisted = true; *p += len; } return 0; @@ -3341,7 +3341,7 @@ static void handle_session(struct ceph_mds_session *session, u32 op; u64 seq, features = 0; int wake = 0; - bool blacklisted = false; + bool blocklisted = false; /* decode */ ceph_decode_need(&p, end, sizeof(*h), bad); @@ -3354,7 +3354,7 @@ static void handle_session(struct ceph_mds_session *session, if (msg_version >= 3) { u32 len; /* version >= 2, metadata */ - if (__decode_session_metadata(&p, end, &blacklisted) < 0) + if (__decode_session_metadata(&p, end, &blocklisted) < 0) goto bad; /* version >= 3, feature bits */ ceph_decode_32_safe(&p, end, len, bad); @@ -3445,8 +3445,8 @@ static void handle_session(struct ceph_mds_session *session, session->s_state = CEPH_MDS_SESSION_REJECTED; cleanup_session_requests(mdsc, session); remove_session_caps(session); - if (blacklisted) - mdsc->fsc->blacklisted = true; + if (blocklisted) + mdsc->fsc->blocklisted = true; wake = 2; /* for good measure */ break; @@ -4367,14 +4367,14 @@ static void maybe_recover_session(struct ceph_mds_client *mdsc) if (READ_ONCE(fsc->mount_state) != CEPH_MOUNT_MOUNTED) return; - if (!READ_ONCE(fsc->blacklisted)) + if (!READ_ONCE(fsc->blocklisted)) return; if (fsc->last_auto_reconnect && time_before(jiffies, fsc->last_auto_reconnect + HZ * 60 * 30)) return; - pr_info("auto reconnect after blacklisted\n"); + pr_info("auto reconnect after blocklisted\n"); fsc->last_auto_reconnect = jiffies; ceph_force_reconnect(fsc->sb); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index b3fc9bb61afc..2516304379d3 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1241,13 +1241,13 @@ int ceph_force_reconnect(struct super_block *sb) * see remove_session_caps_cb() */ flush_workqueue(fsc->inode_wq); - /* In case that we were blacklisted. This also reset + /* In case that we were blocklisted. This also reset * all mon/osd connections */ ceph_reset_client_addr(fsc->client); ceph_osdc_clear_abort_err(&fsc->client->osdc); - fsc->blacklisted = false; + fsc->blocklisted = false; fsc->mount_state = CEPH_MOUNT_MOUNTED; if (sb->s_root) { diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 483a52d281cd..582694899130 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -32,7 +32,7 @@ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) -#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blacklisted */ +#define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blocklisted */ #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ #define CEPH_MOUNT_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */ #define CEPH_MOUNT_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */ @@ -109,7 +109,7 @@ struct ceph_fs_client { unsigned long mount_state; unsigned long last_auto_reconnect; - bool blacklisted; + bool blocklisted; bool have_copy_from2; diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h index ce4ffeb384d7..b658961156a0 100644 --- a/include/linux/ceph/mon_client.h +++ b/include/linux/ceph/mon_client.h @@ -142,7 +142,7 @@ int ceph_monc_get_version(struct ceph_mon_client *monc, const char *what, int ceph_monc_get_version_async(struct ceph_mon_client *monc, const char *what, ceph_monc_callback_t cb, u64 private_data); -int ceph_monc_blacklist_add(struct ceph_mon_client *monc, +int ceph_monc_blocklist_add(struct ceph_mon_client *monc, struct ceph_entity_addr *client_addr); extern int ceph_monc_open_session(struct ceph_mon_client *monc); diff --git a/include/linux/ceph/rados.h b/include/linux/ceph/rados.h index 3a518fd0eaad..43a7a1573b51 100644 --- a/include/linux/ceph/rados.h +++ b/include/linux/ceph/rados.h @@ -424,7 +424,7 @@ enum { }; #define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/ -#define EBLACKLISTED ESHUTDOWN /* blacklisted */ +#define EBLOCKLISTED ESHUTDOWN /* blocklisted */ /* xattr comparison */ enum { diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index d633a0aeaa55..efcdde471278 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -896,7 +896,7 @@ bad: ceph_msg_dump(msg); } -int ceph_monc_blacklist_add(struct ceph_mon_client *monc, +int ceph_monc_blocklist_add(struct ceph_mon_client *monc, struct ceph_entity_addr *client_addr) { struct ceph_mon_generic_request *req; @@ -936,9 +936,9 @@ int ceph_monc_blacklist_add(struct ceph_mon_client *monc, ret = wait_generic_request(req); if (!ret) /* - * Make sure we have the osdmap that includes the blacklist + * Make sure we have the osdmap that includes the blocklist * entry. This is needed to ensure that the OSDs pick up the - * new blacklist before processing any future requests from + * new blocklist before processing any future requests from * this client. */ ret = ceph_wait_for_latest_osdmap(monc->client, 0); @@ -947,7 +947,7 @@ out: put_generic_request(req); return ret; } -EXPORT_SYMBOL(ceph_monc_blacklist_add); +EXPORT_SYMBOL(ceph_monc_blocklist_add); /* * Resend pending generic requests. From 1b05fae7f29db4c41864aed903865086e070fa89 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 15 Sep 2020 20:38:34 +0200 Subject: [PATCH 11/27] libceph: switch to the new "osd blocklist add" command Signed-off-by: Ilya Dryomov --- net/ceph/mon_client.c | 67 +++++++++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 15 deletions(-) diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index efcdde471278..c4cf2529d08b 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -896,8 +896,9 @@ bad: ceph_msg_dump(msg); } -int ceph_monc_blocklist_add(struct ceph_mon_client *monc, - struct ceph_entity_addr *client_addr) +static __printf(2, 0) +int do_mon_command_vargs(struct ceph_mon_client *monc, const char *fmt, + va_list ap) { struct ceph_mon_generic_request *req; struct ceph_mon_command *h; @@ -925,28 +926,64 @@ int ceph_monc_blocklist_add(struct ceph_mon_client *monc, h->monhdr.session_mon_tid = 0; h->fsid = monc->monmap->fsid; h->num_strs = cpu_to_le32(1); - len = sprintf(h->str, "{ \"prefix\": \"osd blacklist\", \ - \"blacklistop\": \"add\", \ - \"addr\": \"%pISpc/%u\" }", - &client_addr->in_addr, le32_to_cpu(client_addr->nonce)); + len = vsprintf(h->str, fmt, ap); h->str_len = cpu_to_le32(len); send_generic_request(monc, req); mutex_unlock(&monc->mutex); ret = wait_generic_request(req); - if (!ret) - /* - * Make sure we have the osdmap that includes the blocklist - * entry. This is needed to ensure that the OSDs pick up the - * new blocklist before processing any future requests from - * this client. - */ - ret = ceph_wait_for_latest_osdmap(monc->client, 0); - out: put_generic_request(req); return ret; } + +static __printf(2, 3) +int do_mon_command(struct ceph_mon_client *monc, const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = do_mon_command_vargs(monc, fmt, ap); + va_end(ap); + return ret; +} + +int ceph_monc_blocklist_add(struct ceph_mon_client *monc, + struct ceph_entity_addr *client_addr) +{ + int ret; + + ret = do_mon_command(monc, + "{ \"prefix\": \"osd blocklist\", \ + \"blocklistop\": \"add\", \ + \"addr\": \"%pISpc/%u\" }", + &client_addr->in_addr, + le32_to_cpu(client_addr->nonce)); + if (ret == -EINVAL) { + /* + * The monitor returns EINVAL on an unrecognized command. + * Try the legacy command -- it is exactly the same except + * for the name. + */ + ret = do_mon_command(monc, + "{ \"prefix\": \"osd blacklist\", \ + \"blacklistop\": \"add\", \ + \"addr\": \"%pISpc/%u\" }", + &client_addr->in_addr, + le32_to_cpu(client_addr->nonce)); + } + if (ret) + return ret; + + /* + * Make sure we have the osdmap that includes the blocklist + * entry. This is needed to ensure that the OSDs pick up the + * new blocklist before processing any future requests from + * this client. + */ + return ceph_wait_for_latest_osdmap(monc->client, 0); +} EXPORT_SYMBOL(ceph_monc_blocklist_add); /* From 4bb926e83f1e2d80409cc8b45336f5f303d49315 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Tue, 15 Sep 2020 21:11:30 +0200 Subject: [PATCH 12/27] ceph: add a note explaining session reject error string error_string key in the metadata map of MClientSession message is intended for humans, but unfortunately became part of the on-wire format with the introduction of recover_session=clean mode in commit 131d7eb4faa1 ("ceph: auto reconnect after blacklisted"). Signed-off-by: Ilya Dryomov --- fs/ceph/mds_client.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index bb2d938a17ac..08f1c0c31dc2 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -3317,6 +3317,10 @@ static int __decode_session_metadata(void **p, void *end, *p += len; ceph_decode_32_safe(p, end, len, bad); ceph_decode_need(p, end, len, bad); + /* + * Match "blocklisted (blacklisted)" from newer MDSes, + * or "blacklisted" from older MDSes. + */ if (err_str && strnstr(*p, "blacklisted", len)) *blocklisted = true; *p += len; From 18d620f063b0780db2a86343dcf3a18e363626b9 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 28 May 2020 13:56:54 -0400 Subject: [PATCH 13/27] ceph: break out writeback of incompatible snap context to separate function When dirtying a page, we have to flush incompatible contexts. Move the search for an incompatible context into a separate function, and fix up the caller to wait and retry if there is one. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 114 +++++++++++++++++++++++++++++-------------------- 1 file changed, 68 insertions(+), 46 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 7b1f3dad576f..f8b478237ea8 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1298,6 +1298,62 @@ static int context_is_writeable_or_written(struct inode *inode, return ret; } +/** + * ceph_find_incompatible - find an incompatible context and return it + * @inode: inode associated with page + * @page: page being dirtied + * + * We are only allowed to write into/dirty a page if the page is + * clean, or already dirty within the same snap context. Returns a + * conflicting context if there is one, NULL if there isn't, or a + * negative error code on other errors. + * + * Must be called with page lock held. + */ +static struct ceph_snap_context * +ceph_find_incompatible(struct inode *inode, struct page *page) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + + if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { + dout(" page %p forced umount\n", page); + return ERR_PTR(-EIO); + } + + for (;;) { + struct ceph_snap_context *snapc, *oldest; + + wait_on_page_writeback(page); + + snapc = page_snap_context(page); + if (!snapc || snapc == ci->i_head_snapc) + break; + + /* + * this page is already dirty in another (older) snap + * context! is it writeable now? + */ + oldest = get_oldest_context(inode, NULL, NULL); + if (snapc->seq > oldest->seq) { + /* not writeable -- return it for the caller to deal with */ + ceph_put_snap_context(oldest); + dout(" page %p snapc %p not current or oldest\n", page, snapc); + return ceph_get_snap_context(snapc); + } + ceph_put_snap_context(oldest); + + /* yay, writeable, do it now (without dropping page lock) */ + dout(" page %p snapc %p not current, but oldest\n", page, snapc); + if (clear_page_dirty_for_io(page)) { + int r = writepage_nounlock(page, NULL); + if (r < 0) + return ERR_PTR(r); + } + } + return NULL; +} + /* * We are only allowed to write into/dirty the page if the page is * clean, or already dirty within the same snap context. @@ -1311,61 +1367,27 @@ static int ceph_update_writeable_page(struct file *file, struct page *page) { struct inode *inode = file_inode(file); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc; loff_t page_off = pos & PAGE_MASK; int pos_in_page = pos & ~PAGE_MASK; int end_in_page = pos_in_page + len; loff_t i_size; int r; - struct ceph_snap_context *snapc, *oldest; - - if (READ_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN) { - dout(" page %p forced umount\n", page); - unlock_page(page); - return -EIO; - } retry_locked: - /* writepages currently holds page lock, but if we change that later, */ - wait_on_page_writeback(page); - - snapc = page_snap_context(page); - if (snapc && snapc != ci->i_head_snapc) { - /* - * this page is already dirty in another (older) snap - * context! is it writeable now? - */ - oldest = get_oldest_context(inode, NULL, NULL); - if (snapc->seq > oldest->seq) { - ceph_put_snap_context(oldest); - dout(" page %p snapc %p not current or oldest\n", - page, snapc); - /* - * queue for writeback, and wait for snapc to - * be writeable or written - */ - snapc = ceph_get_snap_context(snapc); - unlock_page(page); - ceph_queue_writeback(inode); - r = wait_event_killable(ci->i_cap_wq, - context_is_writeable_or_written(inode, snapc)); - ceph_put_snap_context(snapc); - if (r == -ERESTARTSYS) - return r; - return -EAGAIN; - } - ceph_put_snap_context(oldest); - - /* yay, writeable, do it now (without dropping page lock) */ - dout(" page %p snapc %p not current, but oldest\n", - page, snapc); - if (!clear_page_dirty_for_io(page)) - goto retry_locked; - r = writepage_nounlock(page, NULL); - if (r < 0) + snapc = ceph_find_incompatible(inode, page); + if (snapc) { + if (IS_ERR(snapc)) { + r = PTR_ERR(snapc); goto fail_unlock; - goto retry_locked; + } + unlock_page(page); + ceph_queue_writeback(inode); + r = wait_event_killable(ci->i_cap_wq, + context_is_writeable_or_written(inode, snapc)); + ceph_put_snap_context(snapc); + return -EAGAIN; } if (PageUptodate(page)) { From d45156bf46c082d8164d93ca0dec9e7ac808dbdc Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 28 May 2020 14:59:49 -0400 Subject: [PATCH 14/27] ceph: don't call ceph_update_writeable_page from page_mkwrite page_mkwrite should only be called with Uptodate pages, so we should only need to flush incompatible snap contexts. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index f8b478237ea8..c2c23b468d13 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1300,7 +1300,6 @@ static int context_is_writeable_or_written(struct inode *inode, /** * ceph_find_incompatible - find an incompatible context and return it - * @inode: inode associated with page * @page: page being dirtied * * We are only allowed to write into/dirty a page if the page is @@ -1311,8 +1310,9 @@ static int context_is_writeable_or_written(struct inode *inode, * Must be called with page lock held. */ static struct ceph_snap_context * -ceph_find_incompatible(struct inode *inode, struct page *page) +ceph_find_incompatible(struct page *page) { + struct inode *inode = page->mapping->host; struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); @@ -1376,7 +1376,7 @@ static int ceph_update_writeable_page(struct file *file, int r; retry_locked: - snapc = ceph_find_incompatible(inode, page); + snapc = ceph_find_incompatible(page); if (snapc) { if (IS_ERR(snapc)) { r = PTR_ERR(snapc); @@ -1689,6 +1689,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) inode_inc_iversion_raw(inode); do { + struct ceph_snap_context *snapc; + lock_page(page); if (page_mkwrite_check_truncate(page, inode) < 0) { @@ -1697,13 +1699,26 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) break; } - err = ceph_update_writeable_page(vma->vm_file, off, len, page); - if (err >= 0) { + snapc = ceph_find_incompatible(page); + if (!snapc) { /* success. we'll keep the page locked. */ set_page_dirty(page); ret = VM_FAULT_LOCKED; + break; } - } while (err == -EAGAIN); + + unlock_page(page); + + if (IS_ERR(snapc)) { + ret = VM_FAULT_SIGBUS; + break; + } + + ceph_queue_writeback(inode); + err = wait_event_killable(ci->i_cap_wq, + context_is_writeable_or_written(inode, snapc)); + ceph_put_snap_context(snapc); + } while (err == 0); if (ret == VM_FAULT_LOCKED || ci->i_inline_version != CEPH_INLINE_NONE) { From 9b4862ecaec5487ba9b192cc70d6fa0c7aef373a Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 14 May 2020 12:05:45 -0400 Subject: [PATCH 15/27] ceph: fold ceph_sync_readpages into ceph_readpage Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 78 ++++++++++++++++---------------------------------- 1 file changed, 25 insertions(+), 53 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c2c23b468d13..5493a5205a5f 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -182,58 +182,15 @@ static int ceph_releasepage(struct page *page, gfp_t g) return !PagePrivate(page); } -/* - * Read some contiguous pages. If we cross a stripe boundary, shorten - * *plen. Return number of bytes read, or error. - */ -static int ceph_sync_readpages(struct ceph_fs_client *fsc, - struct ceph_vino vino, - struct ceph_file_layout *layout, - u64 off, u64 *plen, - u32 truncate_seq, u64 truncate_size, - struct page **pages, int num_pages, - int page_align) -{ - struct ceph_osd_client *osdc = &fsc->client->osdc; - struct ceph_osd_request *req; - int rc = 0; - - dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino, - vino.snap, off, *plen); - req = ceph_osdc_new_request(osdc, layout, vino, off, plen, 0, 1, - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, - NULL, truncate_seq, truncate_size, - false); - if (IS_ERR(req)) - return PTR_ERR(req); - - /* it may be a short read due to an object boundary */ - osd_req_op_extent_osd_data_pages(req, 0, - pages, *plen, page_align, false, false); - - dout("readpages final extent is %llu~%llu (%llu bytes align %d)\n", - off, *plen, *plen, page_align); - - rc = ceph_osdc_start_request(osdc, req, false); - if (!rc) - rc = ceph_osdc_wait_request(osdc, req); - - ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, rc); - - ceph_osdc_put_request(req); - dout("readpages result %d\n", rc); - return rc; -} - -/* - * read a single page, without unlocking it. - */ +/* read a single page, without unlocking it. */ static int ceph_do_readpage(struct file *filp, struct page *page) { struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_osd_client *osdc = &fsc->client->osdc; + struct ceph_osd_request *req; + struct ceph_vino vino = ceph_vino(inode); int err = 0; u64 off = page_offset(page); u64 len = PAGE_SIZE; @@ -260,12 +217,27 @@ static int ceph_do_readpage(struct file *filp, struct page *page) if (err == 0) return -EINPROGRESS; - dout("readpage inode %p file %p page %p index %lu\n", - inode, filp, page, page->index); - err = ceph_sync_readpages(fsc, ceph_vino(inode), - &ci->i_layout, off, &len, - ci->i_truncate_seq, ci->i_truncate_size, - &page, 1, 0); + dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n", + vino.ino, vino.snap, filp, off, len, page, page->index); + req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1, + CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, + ci->i_truncate_seq, ci->i_truncate_size, + false); + if (IS_ERR(req)) + return PTR_ERR(req); + + osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); + + err = ceph_osdc_start_request(osdc, req, false); + if (!err) + err = ceph_osdc_wait_request(osdc, req); + + ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, err); + + ceph_osdc_put_request(req); + dout("readpage result %d\n", err); + if (err == -ENOENT) err = 0; if (err < 0) { From 6390987f2f4c5dcacb3ef7b9cb2ef5b8fdca3e10 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 14 Jul 2020 14:37:15 -0400 Subject: [PATCH 16/27] ceph: fold ceph_sync_writepages into writepage_nounlock Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 93 +++++++++++++++++++------------------------------- 1 file changed, 35 insertions(+), 58 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 5493a5205a5f..72cbaac68256 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -591,50 +591,6 @@ static u64 get_writepages_data_length(struct inode *inode, return end > start ? end - start : 0; } -/* - * do a synchronous write on N pages - */ -static int ceph_sync_writepages(struct ceph_fs_client *fsc, - struct ceph_vino vino, - struct ceph_file_layout *layout, - struct ceph_snap_context *snapc, - u64 off, u64 len, - u32 truncate_seq, u64 truncate_size, - struct timespec64 *mtime, - struct page **pages, int num_pages) -{ - struct ceph_osd_client *osdc = &fsc->client->osdc; - struct ceph_osd_request *req; - int rc = 0; - int page_align = off & ~PAGE_MASK; - - req = ceph_osdc_new_request(osdc, layout, vino, off, &len, 0, 1, - CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, - snapc, truncate_seq, truncate_size, - true); - if (IS_ERR(req)) - return PTR_ERR(req); - - /* it may be a short write due to an object boundary */ - osd_req_op_extent_osd_data_pages(req, 0, pages, len, page_align, - false, false); - dout("writepages %llu~%llu (%llu bytes)\n", off, len, len); - - req->r_mtime = *mtime; - rc = ceph_osdc_start_request(osdc, req, true); - if (!rc) - rc = ceph_osdc_wait_request(osdc, req); - - ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, rc); - - ceph_osdc_put_request(req); - if (rc == 0) - rc = len; - dout("writepages result %d\n", rc); - return rc; -} - /* * Write a single page, but leave the page locked. * @@ -643,20 +599,19 @@ static int ceph_sync_writepages(struct ceph_fs_client *fsc, */ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) { - struct inode *inode; - struct ceph_inode_info *ci; - struct ceph_fs_client *fsc; + struct inode *inode = page->mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_snap_context *snapc, *oldest; loff_t page_off = page_offset(page); - int err, len = PAGE_SIZE; + int err; + loff_t len = PAGE_SIZE; struct ceph_writeback_ctl ceph_wbc; + struct ceph_osd_client *osdc = &fsc->client->osdc; + struct ceph_osd_request *req; dout("writepage %p idx %lu\n", page, page->index); - inode = page->mapping->host; - ci = ceph_inode(inode); - fsc = ceph_inode_to_client(inode); - /* verify this is a writeable snap context */ snapc = page_snap_context(page); if (!snapc) { @@ -685,7 +640,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (ceph_wbc.i_size < page_off + len) len = ceph_wbc.i_size - page_off; - dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n", + dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n", inode, page, page->index, page_off, len, snapc, snapc->seq); if (atomic_long_inc_return(&fsc->writeback_count) > @@ -693,11 +648,33 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); set_page_writeback(page); - err = ceph_sync_writepages(fsc, ceph_vino(inode), - &ci->i_layout, snapc, page_off, len, - ceph_wbc.truncate_seq, - ceph_wbc.truncate_size, - &inode->i_mtime, &page, 1); + req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc, + ceph_wbc.truncate_seq, ceph_wbc.truncate_size, + true); + if (IS_ERR(req)) { + redirty_page_for_writepage(wbc, page); + end_page_writeback(page); + return PTR_ERR(req); + } + + /* it may be a short write due to an object boundary */ + WARN_ON_ONCE(len > PAGE_SIZE); + osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); + dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len); + + req->r_mtime = inode->i_mtime; + err = ceph_osdc_start_request(osdc, req, true); + if (!err) + err = ceph_osdc_wait_request(osdc, req); + + ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, err); + + ceph_osdc_put_request(req); + if (err == 0) + err = len; + if (err < 0) { struct writeback_control tmp_wbc; if (!wbc) From 1cc1699070bd8f42111b92e5c8018bd7d52f0003 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 5 Jun 2020 09:05:17 -0400 Subject: [PATCH 17/27] ceph: fold ceph_update_writeable_page into ceph_write_begin ...and reorganize the loop for better clarity. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 152 +++++++++++++++++++++---------------------------- 1 file changed, 66 insertions(+), 86 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 72cbaac68256..97827f68a3e7 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1303,78 +1303,6 @@ ceph_find_incompatible(struct page *page) return NULL; } -/* - * We are only allowed to write into/dirty the page if the page is - * clean, or already dirty within the same snap context. - * - * called with page locked. - * return success with page locked, - * or any failure (incl -EAGAIN) with page unlocked. - */ -static int ceph_update_writeable_page(struct file *file, - loff_t pos, unsigned len, - struct page *page) -{ - struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_snap_context *snapc; - loff_t page_off = pos & PAGE_MASK; - int pos_in_page = pos & ~PAGE_MASK; - int end_in_page = pos_in_page + len; - loff_t i_size; - int r; - -retry_locked: - snapc = ceph_find_incompatible(page); - if (snapc) { - if (IS_ERR(snapc)) { - r = PTR_ERR(snapc); - goto fail_unlock; - } - unlock_page(page); - ceph_queue_writeback(inode); - r = wait_event_killable(ci->i_cap_wq, - context_is_writeable_or_written(inode, snapc)); - ceph_put_snap_context(snapc); - return -EAGAIN; - } - - if (PageUptodate(page)) { - dout(" page %p already uptodate\n", page); - return 0; - } - - /* full page? */ - if (pos_in_page == 0 && len == PAGE_SIZE) - return 0; - - /* past end of file? */ - i_size = i_size_read(inode); - - if (page_off >= i_size || - (pos_in_page == 0 && (pos+len) >= i_size && - end_in_page - pos_in_page != PAGE_SIZE)) { - dout(" zeroing %p 0 - %d and %d - %d\n", - page, pos_in_page, end_in_page, (int)PAGE_SIZE); - zero_user_segments(page, - 0, pos_in_page, - end_in_page, PAGE_SIZE); - return 0; - } - - /* we need to read it. */ - r = ceph_do_readpage(file, page); - if (r < 0) { - if (r == -EINPROGRESS) - return -EAGAIN; - goto fail_unlock; - } - goto retry_locked; -fail_unlock: - unlock_page(page); - return r; -} - /* * We are only allowed to write into/dirty the page if the page is * clean, or already dirty within the same snap context. @@ -1384,26 +1312,78 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = file_inode(file); - struct page *page; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc; + struct page *page = NULL; pgoff_t index = pos >> PAGE_SHIFT; - int r; + int pos_in_page = pos & ~PAGE_MASK; + int r = 0; - do { - /* get a page */ + dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); + + for (;;) { page = grab_cache_page_write_begin(mapping, index, 0); - if (!page) - return -ENOMEM; + if (!page) { + r = -ENOMEM; + break; + } - dout("write_begin file %p inode %p page %p %d~%d\n", file, - inode, page, (int)pos, (int)len); - - r = ceph_update_writeable_page(file, pos, len, page); - if (r < 0) + snapc = ceph_find_incompatible(page); + if (snapc) { + if (IS_ERR(snapc)) { + r = PTR_ERR(snapc); + break; + } + unlock_page(page); put_page(page); - else - *pagep = page; - } while (r == -EAGAIN); + page = NULL; + ceph_queue_writeback(inode); + r = wait_event_killable(ci->i_cap_wq, + context_is_writeable_or_written(inode, snapc)); + ceph_put_snap_context(snapc); + if (r != 0) + break; + continue; + } + if (PageUptodate(page)) { + dout(" page %p already uptodate\n", page); + break; + } + + /* + * In some cases we don't need to read at all: + * - full page write + * - write that lies completely beyond EOF + * - write that covers the the page from start to EOF or beyond it + */ + if ((pos_in_page == 0 && len == PAGE_SIZE) || + (pos >= i_size_read(inode)) || + (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) { + zero_user_segments(page, 0, pos_in_page, + pos_in_page + len, PAGE_SIZE); + break; + } + + /* + * We need to read it. If we get back -EINPROGRESS, then the page was + * handed off to fscache and it will be unlocked when the read completes. + * Refind the page in that case so we can reacquire the page lock. Otherwise + * we got a hard error or the read was completed synchronously. + */ + r = ceph_do_readpage(file, page); + if (r != -EINPROGRESS) + break; + } + + if (r < 0) { + if (page) { + unlock_page(page); + put_page(page); + } + } else { + *pagep = page; + } return r; } From f6fbdcd997f5d5d0658204ca42abdeced56fd4e5 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Thu, 17 Sep 2020 18:45:44 +0200 Subject: [PATCH 18/27] ceph: mark ceph_fmt_xattr() as printf-like for better type checking Signed-off-by: Ilya Dryomov --- fs/ceph/xattr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 3a733ac33d9b..197cb1234341 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -116,7 +116,8 @@ static ssize_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val, * NULL terminates however, so call it on a temporary buffer and then memcpy * the result into place. */ -static int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...) +static __printf(3, 4) +int ceph_fmt_xattr(char *val, size_t size, const char *fmt, ...) { int ret; va_list args; From 7edf1ec5b249cb7f9b85b7257f638026fd1a5d2b Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 1 Oct 2020 13:40:49 -0400 Subject: [PATCH 19/27] ceph: don't SetPageError on readpage errors PageError really only has meaning within a particular subsystem. Nothing looks at this bit in the core kernel code, and ceph itself doesn't care about it. Don't bother setting the PageError bit on error. Signed-off-by: Jeff Layton Reviewed-by: Matthew Wilcox (Oracle) Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 97827f68a3e7..137c0a5a2a0d 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -241,7 +241,6 @@ static int ceph_do_readpage(struct file *filp, struct page *page) if (err == -ENOENT) err = 0; if (err < 0) { - SetPageError(page); ceph_fscache_readpage_cancel(inode, page); if (err == -EBLOCKLISTED) fsc->blocklisted = true; From c403c3a2fbe24d4ed33e10cabad048583ebd4edf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sun, 4 Oct 2020 19:04:24 +0100 Subject: [PATCH 20/27] ceph: promote to unsigned long long before shifting On 32-bit systems, this shift will overflow for files larger than 4GB. Cc: stable@vger.kernel.org Fixes: 61f68816211e ("ceph: check caps in filemap_fault and page_mkwrite") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/addr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 137c0a5a2a0d..35c83f65475b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1471,7 +1471,7 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; struct page *pinned_page = NULL; - loff_t off = vmf->pgoff << PAGE_SHIFT; + loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT; int want, got, err; sigset_t oldset; vm_fault_t ret = VM_FAULT_SIGBUS; From 5231198089afba422dd2c46fef515291fca53bdd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 30 Mar 2020 13:07:23 -0400 Subject: [PATCH 21/27] ceph: drop separate mdsc argument from __send_cap We can get it from the session if we need it. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index c7e69547628e..32937bec4a25 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1454,8 +1454,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, * * Caller should hold snap_rwsem (read), s_mutex. */ -static void __send_cap(struct ceph_mds_client *mdsc, struct cap_msg_args *arg, - struct ceph_inode_info *ci) +static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) { struct inode *inode = &ci->vfs_inode; int ret; @@ -1467,7 +1466,7 @@ static void __send_cap(struct ceph_mds_client *mdsc, struct cap_msg_args *arg, ceph_vinop(inode), ceph_cap_string(arg->dirty), arg->flush_tid); spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); + __cap_delay_requeue(arg->session->s_mdsc, ci); spin_unlock(&ci->i_ceph_lock); } @@ -2147,7 +2146,7 @@ ack: want, retain, flushing, flush_tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); - __send_cap(mdsc, &arg, ci); + __send_cap(&arg, ci); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } @@ -2221,7 +2220,7 @@ retry_locked: flushing, flush_tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); - __send_cap(mdsc, &arg, ci); + __send_cap(&arg, ci); } else { if (!list_empty(&ci->i_cap_flush_list)) { struct ceph_cap_flush *cf = @@ -2435,7 +2434,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, (cap->issued | cap->implemented), cf->caps, cf->tid, oldest_flush_tid); spin_unlock(&ci->i_ceph_lock); - __send_cap(mdsc, &arg, ci); + __send_cap(&arg, ci); } else { struct ceph_cap_snap *capsnap = container_of(cf, struct ceph_cap_snap, From 16d68903f56ae277446cc2d24ab18db835363eda Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 30 Mar 2020 07:20:27 -0400 Subject: [PATCH 22/27] ceph: break up send_cap_msg Push the allocation of the msg and the send into the caller. Rename the function to encode_cap_msg and make it void return. Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 60 +++++++++++++++++++++++--------------------------- 1 file changed, 28 insertions(+), 32 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 32937bec4a25..0874ac445cdc 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1222,36 +1222,27 @@ struct cap_msg_args { }; /* - * Build and send a cap message to the given MDS. - * - * Caller should be holding s_mutex. + * cap struct size + flock buffer size + inline version + inline data size + + * osd_epoch_barrier + oldest_flush_tid */ -static int send_cap_msg(struct cap_msg_args *arg) +#define CAP_MSG_SIZE (sizeof(struct ceph_mds_caps) + \ + 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4) + +/* Marshal up the cap msg to the MDS */ +static void encode_cap_msg(struct ceph_msg *msg, struct cap_msg_args *arg) { struct ceph_mds_caps *fc; - struct ceph_msg *msg; void *p; - size_t extra_len; struct ceph_osd_client *osdc = &arg->session->s_mdsc->fsc->client->osdc; - dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" - " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu" - " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op), - arg->cid, arg->ino, ceph_cap_string(arg->caps), - ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty), - arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid, - arg->mseq, arg->follows, arg->size, arg->max_size, - arg->xattr_version, + dout("%s %s %llx %llx caps %s wanted %s dirty %s seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu xattr_ver %llu xattr_len %d\n", + __func__, ceph_cap_op_name(arg->op), arg->cid, arg->ino, + ceph_cap_string(arg->caps), ceph_cap_string(arg->wanted), + ceph_cap_string(arg->dirty), arg->seq, arg->issue_seq, + arg->flush_tid, arg->oldest_flush_tid, arg->mseq, arg->follows, + arg->size, arg->max_size, arg->xattr_version, arg->xattr_buf ? (int)arg->xattr_buf->vec.iov_len : 0); - /* flock buffer size + inline version + inline data size + - * osd_epoch_barrier + oldest_flush_tid */ - extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4; - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, - GFP_NOFS, false); - if (!msg) - return -ENOMEM; - msg->hdr.version = cpu_to_le16(10); msg->hdr.tid = cpu_to_le64(arg->flush_tid); @@ -1323,9 +1314,6 @@ static int send_cap_msg(struct cap_msg_args *arg) /* Advisory flags (version 10) */ ceph_encode_32(&p, arg->flags); - - ceph_con_send(&arg->session->s_con, msg); - return 0; } /* @@ -1456,22 +1444,23 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, */ static void __send_cap(struct cap_msg_args *arg, struct ceph_inode_info *ci) { + struct ceph_msg *msg; struct inode *inode = &ci->vfs_inode; - int ret; - ret = send_cap_msg(arg); - if (ret < 0) { - pr_err("error sending cap msg, ino (%llx.%llx) " - "flushing %s tid %llu, requeue\n", + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); + if (!msg) { + pr_err("error allocating cap msg: ino (%llx.%llx) flushing %s tid %llu, requeuing cap.\n", ceph_vinop(inode), ceph_cap_string(arg->dirty), arg->flush_tid); spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(arg->session->s_mdsc, ci); spin_unlock(&ci->i_ceph_lock); + return; } + encode_cap_msg(msg, arg); + ceph_con_send(&arg->session->s_con, msg); ceph_buffer_put(arg->old_xattr_buf); - if (arg->wake) wake_up_all(&ci->i_cap_wq); } @@ -1482,6 +1471,11 @@ static inline int __send_flush_snap(struct inode *inode, u32 mseq, u64 oldest_flush_tid) { struct cap_msg_args arg; + struct ceph_msg *msg; + + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, CAP_MSG_SIZE, GFP_NOFS, false); + if (!msg) + return -ENOMEM; arg.session = session; arg.ino = ceph_vino(inode).ino; @@ -1520,7 +1514,9 @@ static inline int __send_flush_snap(struct inode *inode, arg.flags = 0; arg.wake = false; - return send_cap_msg(&arg); + encode_cap_msg(msg, &arg); + ceph_con_send(&arg.session->s_con, msg); + return 0; } /* From c74d79af9002a9cb179f7950a2c266f0de964abe Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 6 Oct 2020 12:24:19 -0400 Subject: [PATCH 23/27] ceph: comment cleanups and clarifications Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 16 ++++++++++++++++ fs/ceph/mds_client.h | 2 +- fs/ceph/super.h | 3 ++- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 0874ac445cdc..5027bbdca419 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1922,12 +1922,24 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, retry: spin_lock(&ci->i_ceph_lock); retry_locked: + /* Caps wanted by virtue of active open files. */ file_wanted = __ceph_caps_file_wanted(ci); + + /* Caps which have active references against them */ used = __ceph_caps_used(ci); + + /* + * "issued" represents the current caps that the MDS wants us to have. + * "implemented" is the set that we have been granted, and includes the + * ones that have not yet been returned to the MDS (the "revoking" set, + * usually because they have outstanding references). + */ issued = __ceph_caps_issued(ci, &implemented); revoking = implemented & ~issued; want = file_wanted; + + /* The ones we currently want to retain (may be adjusted below) */ retain = file_wanted | used | CEPH_CAP_PIN; if (!mdsc->stopping && inode->i_nlink > 0) { if (file_wanted) { @@ -2005,6 +2017,10 @@ retry_locked: /* NOTE: no side-effects allowed, until we take s_mutex */ + /* + * If we have an auth cap, we don't need to consider any + * overlapping caps as used. + */ cap_used = used; if (ci->i_auth_cap && cap != ci->i_auth_cap) cap_used &= ~ci->i_auth_cap->issued; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 658800605bfb..cbf8af437140 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -393,7 +393,7 @@ struct ceph_mds_client { struct ceph_mds_session **sessions; /* NULL for mds if no session */ atomic_t num_sessions; - int max_sessions; /* len of s_mds_sessions */ + int max_sessions; /* len of sessions array */ int stopping; /* true if shutting down */ atomic64_t quotarealms_count; /* # realms with quota */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 582694899130..482473e4cce1 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -160,7 +160,8 @@ struct ceph_cap { int issued; /* latest, from the mds */ int implemented; /* implemented superset of issued (for revocation) */ - int mds, mds_wanted; + int mds; /* mds index for this cap */ + int mds_wanted; /* caps wanted from this mds */ }; /* caps to release */ struct { From 5a5036c89f345769c0049f4af04cc6647c0df058 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 2 Oct 2020 14:15:59 +0200 Subject: [PATCH 24/27] libceph: move a dout in queue_con_delay() The queued con->work can start executing (and therefore logging) before we get to this "con->work has been queued" message, making the logs confusing. Move it up, with the meaning of "con->work is about to be queued". Signed-off-by: Ilya Dryomov --- net/ceph/messenger.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d4d7a0e52491..8eabcdc2af56 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2811,13 +2811,13 @@ static int queue_con_delay(struct ceph_connection *con, unsigned long delay) return -ENOENT; } + dout("%s %p %lu\n", __func__, con, delay); if (!queue_delayed_work(ceph_msgr_wq, &con->work, delay)) { dout("%s %p - already queued\n", __func__, con); con->ops->put(con); return -EBUSY; } - dout("%s %p %lu\n", __func__, con, delay); return 0; } From b07720d0bd1e7c2251642010efb6075dbee23bb8 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Fri, 2 Oct 2020 14:38:08 +0200 Subject: [PATCH 25/27] libceph: fix ENTITY_NAME format suggestion Signed-off-by: Ilya Dryomov --- include/linux/ceph/messenger.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 76371aaae2d1..60b324efd1c4 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -54,7 +54,7 @@ struct ceph_connection_operations { int (*check_message_signature) (struct ceph_msg *msg); }; -/* use format string %s%d */ +/* use format string %s%lld */ #define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num) struct ceph_messenger { From a9dfe31e5ce31022b8f063b30d2907872cbab447 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sat, 3 Oct 2020 11:52:15 +0200 Subject: [PATCH 26/27] libceph: format ceph_entity_addr nonces as unsigned Match the server side logs. Signed-off-by: Ilya Dryomov --- net/ceph/messenger.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 8eabcdc2af56..02a195e013b7 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2016,11 +2016,11 @@ static int process_banner(struct ceph_connection *con) sizeof(con->peer_addr)) != 0 && !(addr_is_blank(&con->actual_peer_addr) && con->actual_peer_addr.nonce == con->peer_addr.nonce)) { - pr_warn("wrong peer, want %s/%d, got %s/%d\n", + pr_warn("wrong peer, want %s/%u, got %s/%u\n", ceph_pr_addr(&con->peer_addr), - (int)le32_to_cpu(con->peer_addr.nonce), + le32_to_cpu(con->peer_addr.nonce), ceph_pr_addr(&con->actual_peer_addr), - (int)le32_to_cpu(con->actual_peer_addr.nonce)); + le32_to_cpu(con->actual_peer_addr.nonce)); con->error_msg = "wrong peer at address"; return -1; } From 28e1581c3b4ea5f98530064a103c6217bedeea73 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Wed, 7 Oct 2020 20:06:48 +0200 Subject: [PATCH 27/27] libceph: clear con->out_msg on Policy::stateful_server faults con->out_msg must be cleared on Policy::stateful_server (!CEPH_MSG_CONNECT_LOSSY) faults. Not doing so botches the reconnection attempt, because after writing the banner the messenger moves on to writing the data section of that message (either from where it got interrupted by the connection reset or from the beginning) instead of writing struct ceph_msg_connect. This results in a bizarre error message because the server sends CEPH_MSGR_TAG_BADPROTOVER but we think we wrote struct ceph_msg_connect: libceph: mds0 (1)172.21.15.45:6828 socket error on write ceph: mds0 reconnect start libceph: mds0 (1)172.21.15.45:6829 socket closed (con state OPEN) libceph: mds0 (1)172.21.15.45:6829 protocol version mismatch, my 32 != server's 32 libceph: mds0 (1)172.21.15.45:6829 protocol version mismatch AFAICT this bug goes back to the dawn of the kernel client. The reason it survived for so long is that only MDS sessions are stateful and only two MDS messages have a data section: CEPH_MSG_CLIENT_RECONNECT (always, but reconnecting is rare) and CEPH_MSG_CLIENT_REQUEST (only when xattrs are involved). The connection has to get reset precisely when such message is being sent -- in this case it was the former. Cc: stable@vger.kernel.org Link: https://tracker.ceph.com/issues/47723 Signed-off-by: Ilya Dryomov Reviewed-by: Jeff Layton --- net/ceph/messenger.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 02a195e013b7..af0f1fa24937 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -2998,6 +2998,11 @@ static void con_fault(struct ceph_connection *con) ceph_msg_put(con->in_msg); con->in_msg = NULL; } + if (con->out_msg) { + BUG_ON(con->out_msg->con != con); + ceph_msg_put(con->out_msg); + con->out_msg = NULL; + } /* Requeue anything that hasn't been acked */ list_splice_init(&con->out_sent, &con->out_queue);