From 2d7c86a8f9cdce1408c4f3c69d94d007eff2f179 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:50 +0530 Subject: [PATCH 01/13] libceph: generalize addr/ip parsing based on delimiter ... and remove hardcoded function name in ceph_parse_ips(). [ idryomov: delim parameter, drop CEPH_ADDR_PARSE_DEFAULT_DELIM ] Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 3 ++- fs/ceph/super.c | 2 +- include/linux/ceph/libceph.h | 2 +- include/linux/ceph/messenger.h | 2 +- net/ceph/ceph_common.c | 9 ++++----- net/ceph/messenger.c | 15 ++++++++------- 6 files changed, 17 insertions(+), 16 deletions(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 953fa134cd3d..909dbe6111bf 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -6497,7 +6497,8 @@ static int rbd_add_parse_args(const char *buf, pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT; pctx.opts->trim = RBD_TRIM_DEFAULT; - ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL); + ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL, + ','); if (ret) goto out_err; diff --git a/fs/ceph/super.c b/fs/ceph/super.c index bab61232dc5a..c444371ebc38 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -272,7 +272,7 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) dout("server path '%s'\n", fsopt->server_path); ret = ceph_parse_mon_ips(param->string, dev_name_end - dev_name, - pctx->copts, fc->log.log); + pctx->copts, fc->log.log, ','); if (ret) return ret; diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 409d8c29bc4f..c72285d8594e 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -301,7 +301,7 @@ struct fs_parameter; struct fc_log; struct ceph_options *ceph_alloc_options(void); int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt, - struct fc_log *l); + struct fc_log *l, char delim); int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, struct fc_log *l); int ceph_print_client_options(struct seq_file *m, struct ceph_client *client, diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 0e6e9ad3c3bf..ff99ce094cfa 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -532,7 +532,7 @@ extern const char *ceph_pr_addr(const struct ceph_entity_addr *addr); extern int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, - int max_count, int *count); + int max_count, int *count, char delim); extern int ceph_msgr_init(void); extern void ceph_msgr_exit(void); diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 97d6ea763e32..851b0c4c5730 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -422,14 +422,14 @@ out: } int ceph_parse_mon_ips(const char *buf, size_t len, struct ceph_options *opt, - struct fc_log *l) + struct fc_log *l, char delim) { struct p_log log = {.prefix = "libceph", .log = l}; int ret; - /* ip1[:port1][,ip2[:port2]...] */ + /* ip1[:port1][ip2[:port2]...] */ ret = ceph_parse_ips(buf, buf + len, opt->mon_addr, CEPH_MAX_MON, - &opt->num_mon); + &opt->num_mon, delim); if (ret) { error_plog(&log, "Failed to parse monitor IPs: %d", ret); return ret; @@ -455,8 +455,7 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, case Opt_ip: err = ceph_parse_ips(param->string, param->string + param->size, - &opt->my_addr, - 1, NULL); + &opt->my_addr, 1, NULL, ','); if (err) { error_plog(&log, "Failed to parse ip: %d", err); return err; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 57d043b382ed..929ed91f2ec3 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1267,30 +1267,31 @@ static int ceph_parse_server_name(const char *name, size_t namelen, */ int ceph_parse_ips(const char *c, const char *end, struct ceph_entity_addr *addr, - int max_count, int *count) + int max_count, int *count, char delim) { int i, ret = -EINVAL; const char *p = c; dout("parse_ips on '%.*s'\n", (int)(end-c), c); for (i = 0; i < max_count; i++) { + char cur_delim = delim; const char *ipend; int port; - char delim = ','; if (*p == '[') { - delim = ']'; + cur_delim = ']'; p++; } - ret = ceph_parse_server_name(p, end - p, &addr[i], delim, &ipend); + ret = ceph_parse_server_name(p, end - p, &addr[i], cur_delim, + &ipend); if (ret) goto bad; ret = -EINVAL; p = ipend; - if (delim == ']') { + if (cur_delim == ']') { if (*p != ']') { dout("missing matching ']'\n"); goto bad; @@ -1326,11 +1327,11 @@ int ceph_parse_ips(const char *c, const char *end, addr[i].type = CEPH_ENTITY_ADDR_TYPE_LEGACY; addr[i].nonce = 0; - dout("parse_ips got %s\n", ceph_pr_addr(&addr[i])); + dout("%s got %s\n", __func__, ceph_pr_addr(&addr[i])); if (p == end) break; - if (*p != ',') + if (*p != delim) goto bad; p++; } From 4153c7fc937a2afa077dbdb9fe3189b9981f423c Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:51 +0530 Subject: [PATCH 02/13] libceph: rename parse_fsid() to ceph_parse_fsid() and export ... as it is too generic. also, use __func__ when logging rather than hardcoding the function name. Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- include/linux/ceph/libceph.h | 1 + net/ceph/ceph_common.c | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index c72285d8594e..644f224eccf7 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -296,6 +296,7 @@ extern bool libceph_compatible(void *data); extern const char *ceph_msg_type_name(int type); extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); extern void *ceph_kvmalloc(size_t size, gfp_t flags); +extern int ceph_parse_fsid(const char *str, struct ceph_fsid *fsid); struct fs_parameter; struct fc_log; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 851b0c4c5730..decae43b4262 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -217,14 +217,14 @@ void *ceph_kvmalloc(size_t size, gfp_t flags) return p; } -static int parse_fsid(const char *str, struct ceph_fsid *fsid) +int ceph_parse_fsid(const char *str, struct ceph_fsid *fsid) { int i = 0; char tmp[3]; int err = -EINVAL; int d; - dout("parse_fsid '%s'\n", str); + dout("%s '%s'\n", __func__, str); tmp[2] = 0; while (*str && i < 16) { if (ispunct(*str)) { @@ -244,9 +244,10 @@ static int parse_fsid(const char *str, struct ceph_fsid *fsid) if (i == 16) err = 0; - dout("parse_fsid ret %d got fsid %pU\n", err, fsid); + dout("%s ret %d got fsid %pU\n", __func__, err, fsid); return err; } +EXPORT_SYMBOL(ceph_parse_fsid); /* * ceph options @@ -464,7 +465,7 @@ int ceph_parse_param(struct fs_parameter *param, struct ceph_options *opt, break; case Opt_fsid: - err = parse_fsid(param->string, &opt->fsid); + err = ceph_parse_fsid(param->string, &opt->fsid); if (err) { error_plog(&log, "Failed to parse fsid: %d", err); return err; From 7b19b4db5add8d9f50e854907a82a10ba4d27c42 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:52 +0530 Subject: [PATCH 03/13] ceph: new device mount syntax Old mount device syntax (source) has the following problems: - mounts to the same cluster but with different fsnames and/or creds have identical device string which can confuse xfstests. - Userspace mount helper tool resolves monitor addresses and fill in mon addrs automatically, but that means the device shown in /proc/mounts is different than what was used for mounting. New device syntax is as follows: cephuser@fsid.mycephfs2=/path Note, there is no "monitor address" in the device string. That gets passed in as mount option. This keeps the device string same when monitor addresses change (on remounts). Also note that the userspace mount helper tool is backward compatible. I.e., the mount helper will fallback to using old syntax after trying to mount with the new syntax. [ idryomov: drop CEPH_MON_ADDR_MNTOPT_DELIM ] Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 141 ++++++++++++++++++++++++++++++++++++++++++++---- fs/ceph/super.h | 3 ++ 2 files changed, 134 insertions(+), 10 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c444371ebc38..e2c40d055711 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -146,6 +146,7 @@ enum { Opt_mds_namespace, Opt_recover_session, Opt_source, + Opt_mon_addr, /* string args above */ Opt_dirstat, Opt_rbytes, @@ -197,6 +198,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_u32 ("rsize", Opt_rsize), fsparam_string ("snapdirname", Opt_snapdirname), fsparam_string ("source", Opt_source), + fsparam_string ("mon_addr", Opt_mon_addr), fsparam_u32 ("wsize", Opt_wsize), fsparam_flag_no ("wsync", Opt_wsync), {} @@ -228,9 +230,92 @@ static void canonicalize_path(char *path) } /* - * Parse the source parameter. Distinguish the server list from the path. + * Check if the mds namespace in ceph_mount_options matches + * the passed in namespace string. First time match (when + * ->mds_namespace is NULL) is treated specially, since + * ->mds_namespace needs to be initialized by the caller. + */ +static int namespace_equals(struct ceph_mount_options *fsopt, + const char *namespace, size_t len) +{ + return !(fsopt->mds_namespace && + (strlen(fsopt->mds_namespace) != len || + strncmp(fsopt->mds_namespace, namespace, len))); +} + +static int ceph_parse_old_source(const char *dev_name, const char *dev_name_end, + struct fs_context *fc) +{ + int r; + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + + if (*dev_name_end != ':') + return invalfc(fc, "separator ':' missing in source"); + + r = ceph_parse_mon_ips(dev_name, dev_name_end - dev_name, + pctx->copts, fc->log.log, ','); + if (r) + return r; + + fsopt->new_dev_syntax = false; + return 0; +} + +static int ceph_parse_new_source(const char *dev_name, const char *dev_name_end, + struct fs_context *fc) +{ + size_t len; + struct ceph_fsid fsid; + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + char *fsid_start, *fs_name_start; + + if (*dev_name_end != '=') { + dout("separator '=' missing in source"); + return -EINVAL; + } + + fsid_start = strchr(dev_name, '@'); + if (!fsid_start) + return invalfc(fc, "missing cluster fsid"); + ++fsid_start; /* start of cluster fsid */ + + fs_name_start = strchr(fsid_start, '.'); + if (!fs_name_start) + return invalfc(fc, "missing file system name"); + + if (ceph_parse_fsid(fsid_start, &fsid)) + return invalfc(fc, "Invalid FSID"); + + ++fs_name_start; /* start of file system name */ + len = dev_name_end - fs_name_start; + + if (!namespace_equals(fsopt, fs_name_start, len)) + return invalfc(fc, "Mismatching mds_namespace"); + kfree(fsopt->mds_namespace); + fsopt->mds_namespace = kstrndup(fs_name_start, len, GFP_KERNEL); + if (!fsopt->mds_namespace) + return -ENOMEM; + dout("file system (mds namespace) '%s'\n", fsopt->mds_namespace); + + fsopt->new_dev_syntax = true; + return 0; +} + +/* + * Parse the source parameter for new device format. Distinguish the device + * spec from the path. Try parsing new device format and fallback to old + * format if needed. * - * The source will look like: + * New device syntax will looks like: + * =/ + * where + * is name@fsid.fsname + * is optional, but if present must begin with '/' + * (monitor addresses are passed via mount option) + * + * Old device syntax is: * [,...]:[] * where * is [:] @@ -263,24 +348,44 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc) dev_name_end = dev_name + strlen(dev_name); } - dev_name_end--; /* back up to ':' separator */ - if (dev_name_end < dev_name || *dev_name_end != ':') - return invalfc(fc, "No path or : separator in source"); + dev_name_end--; /* back up to separator */ + if (dev_name_end < dev_name) + return invalfc(fc, "Path missing in source"); dout("device name '%.*s'\n", (int)(dev_name_end - dev_name), dev_name); if (fsopt->server_path) dout("server path '%s'\n", fsopt->server_path); - ret = ceph_parse_mon_ips(param->string, dev_name_end - dev_name, - pctx->copts, fc->log.log, ','); - if (ret) - return ret; + dout("trying new device syntax"); + ret = ceph_parse_new_source(dev_name, dev_name_end, fc); + if (ret) { + if (ret != -EINVAL) + return ret; + dout("trying old device syntax"); + ret = ceph_parse_old_source(dev_name, dev_name_end, fc); + if (ret) + return ret; + } fc->source = param->string; param->string = NULL; return 0; } +static int ceph_parse_mon_addr(struct fs_parameter *param, + struct fs_context *fc) +{ + struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; + + kfree(fsopt->mon_addr); + fsopt->mon_addr = param->string; + param->string = NULL; + + return ceph_parse_mon_ips(fsopt->mon_addr, strlen(fsopt->mon_addr), + pctx->copts, fc->log.log, '/'); +} + static int ceph_parse_mount_param(struct fs_context *fc, struct fs_parameter *param) { @@ -306,6 +411,8 @@ static int ceph_parse_mount_param(struct fs_context *fc, param->string = NULL; break; case Opt_mds_namespace: + if (!namespace_equals(fsopt, param->string, strlen(param->string))) + return invalfc(fc, "Mismatching mds_namespace"); kfree(fsopt->mds_namespace); fsopt->mds_namespace = param->string; param->string = NULL; @@ -323,6 +430,8 @@ static int ceph_parse_mount_param(struct fs_context *fc, if (fc->source) return invalfc(fc, "Multiple sources specified"); return ceph_parse_source(param, fc); + case Opt_mon_addr: + return ceph_parse_mon_addr(param, fc); case Opt_wsize: if (result.uint_32 < PAGE_SIZE || result.uint_32 > CEPH_MAX_WRITE_SIZE) @@ -474,6 +583,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) kfree(args->mds_namespace); kfree(args->server_path); kfree(args->fscache_uniq); + kfree(args->mon_addr); kfree(args); } @@ -517,6 +627,10 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, if (ret) return ret; + ret = strcmp_null(fsopt1->mon_addr, fsopt2->mon_addr); + if (ret) + return ret; + return ceph_compare_options(new_opt, fsc->client); } @@ -572,9 +686,13 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if ((fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM) == 0) seq_puts(m, ",copyfrom"); - if (fsopt->mds_namespace) + /* dump mds_namespace when old device syntax is in use */ + if (fsopt->mds_namespace && !fsopt->new_dev_syntax) seq_show_option(m, "mds_namespace", fsopt->mds_namespace); + if (fsopt->mon_addr) + seq_printf(m, ",mon_addr=%s", fsopt->mon_addr); + if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) seq_show_option(m, "recover_session", "clean"); @@ -1060,6 +1178,7 @@ static int ceph_setup_bdi(struct super_block *sb, struct ceph_fs_client *fsc) static int ceph_get_tree(struct fs_context *fc) { struct ceph_parse_opts_ctx *pctx = fc->fs_private; + struct ceph_mount_options *fsopt = pctx->opts; struct super_block *sb; struct ceph_fs_client *fsc; struct dentry *res; @@ -1071,6 +1190,8 @@ static int ceph_get_tree(struct fs_context *fc) if (!fc->source) return invalfc(fc, "No source"); + if (fsopt->new_dev_syntax && !fsopt->mon_addr) + return invalfc(fc, "No monitor address"); /* create client (which we may/may not use) */ fsc = create_fs_client(pctx->opts, pctx->copts); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index ac331aa07cfa..ec6b221e5b62 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -89,6 +89,8 @@ struct ceph_mount_options { unsigned int max_readdir; /* max readdir result (entries) */ unsigned int max_readdir_bytes; /* max readdir result (bytes) */ + bool new_dev_syntax; + /* * everything above this point can be memcmp'd; everything below * is handled in compare_mount_options() @@ -98,6 +100,7 @@ struct ceph_mount_options { char *mds_namespace; /* default NULL */ char *server_path; /* default NULL (means "/") */ char *fscache_uniq; /* default NULL */ + char *mon_addr; }; struct ceph_fs_client { From 2167f2cc686a97911a0b06ba9c97cec304b7c432 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:53 +0530 Subject: [PATCH 04/13] ceph: record updated mon_addr on remount Note that the new monitors are just shown in /proc/mounts. Ceph does not (re)connect to new monitors yet. [ jlayton: s/printk\(KERN_NOTICE/pr_notice(/ s/strcmp/strcmp_null/ ] Signed-off-by: Venky Shankar Signed-off-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index e2c40d055711..31b5786cd98f 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1277,6 +1277,13 @@ static int ceph_reconfigure_fc(struct fs_context *fc) else ceph_clear_mount_opt(fsc, ASYNC_DIROPS); + if (strcmp_null(fsc->mount_options->mon_addr, fsopt->mon_addr)) { + kfree(fsc->mount_options->mon_addr); + fsc->mount_options->mon_addr = fsopt->mon_addr; + fsopt->mon_addr = NULL; + pr_notice("ceph: monitor addresses recorded, but not used for reconnection"); + } + sync_filesystem(fc->root->d_sb); return 0; } From e1b9eb50763d108166651ca67aae09893332c6b0 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 14 Jul 2021 15:35:54 +0530 Subject: [PATCH 05/13] doc: document new CephFS mount device syntax Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- Documentation/filesystems/ceph.rst | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/Documentation/filesystems/ceph.rst b/Documentation/filesystems/ceph.rst index 7d2ef4e27273..4942e018db85 100644 --- a/Documentation/filesystems/ceph.rst +++ b/Documentation/filesystems/ceph.rst @@ -82,7 +82,7 @@ Mount Syntax The basic mount syntax is:: - # mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt + # mount -t ceph user@fsid.fs_name=/[subdir] mnt -o mon_addr=monip1[:port][/monip2[:port]] You only need to specify a single monitor, as the client will get the full list when it connects. (However, if the monitor you specify @@ -90,16 +90,35 @@ happens to be down, the mount won't succeed.) The port can be left off if the monitor is using the default. So if the monitor is at 1.2.3.4:: - # mount -t ceph 1.2.3.4:/ /mnt/ceph + # mount -t ceph cephuser@07fe3187-00d9-42a3-814b-72a4d5e7d5be.cephfs=/ /mnt/ceph -o mon_addr=1.2.3.4 is sufficient. If /sbin/mount.ceph is installed, a hostname can be -used instead of an IP address. +used instead of an IP address and the cluster FSID can be left out +(as the mount helper will fill it in by reading the ceph configuration +file):: + # mount -t ceph cephuser@cephfs=/ /mnt/ceph -o mon_addr=mon-addr +Multiple monitor addresses can be passed by separating each address with a slash (`/`):: + + # mount -t ceph cephuser@cephfs=/ /mnt/ceph -o mon_addr=192.168.1.100/192.168.1.101 + +When using the mount helper, monitor address can be read from ceph +configuration file if available. Note that, the cluster FSID (passed as part +of the device string) is validated by checking it with the FSID reported by +the monitor. Mount Options ============= + mon_addr=ip_address[:port][/ip_address[:port]] + Monitor address to the cluster. This is used to bootstrap the + connection to the cluster. Once connection is established, the + monitor addresses in the monitor map are followed. + + fsid=cluster-id + FSID of the cluster (from `ceph fsid` command). + ip=A.B.C.D[:N] Specify the IP and/or port the client should bind to locally. There is normally not much reason to do this. If the IP is not From adbed05ed62d1f3b6f6c5cb88ec52c1ffafc0fd9 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Wed, 3 Nov 2021 10:30:39 +0530 Subject: [PATCH 06/13] ceph: mount syntax module parameter Add read-only module parameters for supported mount syntaxes. Primary user is the user-space mount helper for catching v2 syntax bugs during testing by cross verifying if the kernel supports v2 syntax on mount failure. Signed-off-by: Venky Shankar Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 31b5786cd98f..2166fc2c1625 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1461,6 +1461,14 @@ bool disable_send_metrics = false; module_param_cb(disable_send_metrics, ¶m_ops_metrics, &disable_send_metrics, 0644); MODULE_PARM_DESC(disable_send_metrics, "Enable sending perf metrics to ceph cluster (default: on)"); +/* for both v1 and v2 syntax */ +static bool mount_support = true; +static const struct kernel_param_ops param_ops_mount_syntax = { + .get = param_get_bool, +}; +module_param_cb(mount_syntax_v1, ¶m_ops_mount_syntax, &mount_support, 0444); +module_param_cb(mount_syntax_v2, ¶m_ops_mount_syntax, &mount_support, 0444); + module_init(init_ceph); module_exit(exit_ceph); From 8e55ba8caae5cd380b1c9c81a426602a667e110e Mon Sep 17 00:00:00 2001 From: Kotresh HR Date: Wed, 10 Nov 2021 23:30:21 +0530 Subject: [PATCH 07/13] ceph: Fix incorrect statfs report for small quota Problem: The statfs reports incorrect free/available space for quota less then CEPH_BLOCK size (4M). Solution: For quota less than CEPH_BLOCK size, smaller block size of 4K is used. But if quota is less than 4K, it is decided to go with binary use/free of 4K block. For quota size less than 4K size, report the total=used=4K,free=0 when quota is full and total=free=4K,used=0 otherwise. Signed-off-by: Kotresh HR Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/quota.c | 14 ++++++++++++++ fs/ceph/super.h | 1 + 2 files changed, 15 insertions(+) diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 620c691af40e..24ae13ea2241 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -494,10 +494,24 @@ bool ceph_quota_update_statfs(struct ceph_fs_client *fsc, struct kstatfs *buf) if (ci->i_max_bytes) { total = ci->i_max_bytes >> CEPH_BLOCK_SHIFT; used = ci->i_rbytes >> CEPH_BLOCK_SHIFT; + /* For quota size less than 4MB, use 4KB block size */ + if (!total) { + total = ci->i_max_bytes >> CEPH_4K_BLOCK_SHIFT; + used = ci->i_rbytes >> CEPH_4K_BLOCK_SHIFT; + buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT; + } /* It is possible for a quota to be exceeded. * Report 'zero' in that case */ free = total > used ? total - used : 0; + /* For quota size less than 4KB, report the + * total=used=4KB,free=0 when quota is full + * and total=free=4KB, used=0 otherwise */ + if (!total) { + total = 1; + free = ci->i_max_bytes > ci->i_rbytes ? 1 : 0; + buf->f_frsize = 1 << CEPH_4K_BLOCK_SHIFT; + } } spin_unlock(&ci->i_ceph_lock); if (total) { diff --git a/fs/ceph/super.h b/fs/ceph/super.h index ec6b221e5b62..533189b537b2 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -32,6 +32,7 @@ * large volume sizes on 32-bit machines. */ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) +#define CEPH_4K_BLOCK_SHIFT 12 /* 4 KB */ #define CEPH_MOUNT_OPT_CLEANRECOVER (1<<1) /* auto reonnect (clean mode) after blocklisted */ #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ From 435a120a47eed0b3a1ac7b86cf1f7707bf2242ce Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sat, 27 Nov 2021 17:21:04 +0000 Subject: [PATCH 08/13] rbd: make const pointer spaces a static const array Don't populate the const array spaces on the stack but make it static const and make the pointer an array to remove a dereference. Shrinks object code a little too. Also clean up intent, currently it is spaces and should be a tab. Signed-off-by: Colin Ian King Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- drivers/block/rbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 909dbe6111bf..1bf1595420a8 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -6191,7 +6191,7 @@ static inline size_t next_token(const char **buf) * These are the characters that produce nonzero for * isspace() in the "C" and "POSIX" locales. */ - const char *spaces = " \f\n\r\t\v"; + static const char spaces[] = " \f\n\r\t\v"; *buf += strspn(*buf, spaces); /* Find start of token */ From af9ceae83cd26c9319bb2cdab23bb16d39300cbd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 26 Jan 2021 13:41:38 -0500 Subject: [PATCH 09/13] ceph: drop send metrics debug message This pops every second and isn't very useful. Signed-off-by: Jeff Layton Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/metric.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index c57699d8408d..0fcba68f9a99 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -160,8 +160,6 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, msg->hdr.version = cpu_to_le16(1); msg->hdr.compat_version = cpu_to_le16(1); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - dout("client%llu send metrics to mds%d\n", - ceph_client_gid(mdsc->fsc->client), s->s_mds); ceph_con_send(&s->s_con, msg); return true; From 0078ea3b0566e3da09ae8e1e4fbfd708702f2876 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 9 Nov 2021 09:54:49 -0500 Subject: [PATCH 10/13] ceph: don't check for quotas on MDS stray dirs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 玮文 胡 reported seeing the WARN_RATELIMIT pop when writing to an inode that had been transplanted into the stray dir. The client was trying to look up the quotarealm info from the parent and that tripped the warning. Change the ceph_vino_is_reserved helper to not throw a warning for MDS stray directories (0x100 - 0x1ff), only for reserved dirs that are not in that range. Also, fix ceph_has_realms_with_quotas to return false when encountering a reserved inode. URL: https://tracker.ceph.com/issues/53180 Reported-by: Hu Weiwen Signed-off-by: Jeff Layton Reviewed-by: Luis Henriques Reviewed-by: Xiubo Li Signed-off-by: Ilya Dryomov --- fs/ceph/quota.c | 3 +++ fs/ceph/super.h | 20 ++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/ceph/quota.c b/fs/ceph/quota.c index 24ae13ea2241..a338a3ec0dc4 100644 --- a/fs/ceph/quota.c +++ b/fs/ceph/quota.c @@ -30,6 +30,9 @@ static inline bool ceph_has_realms_with_quotas(struct inode *inode) /* if root is the real CephFS root, we don't have quota realms */ if (root && ceph_ino(root) == CEPH_INO_ROOT) return false; + /* MDS stray dirs have no quota realms */ + if (ceph_vino_is_reserved(ceph_inode(inode)->i_vino)) + return false; /* otherwise, we can't know for sure */ return true; } diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 533189b537b2..5ec465a1350a 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -539,19 +539,23 @@ static inline int ceph_ino_compare(struct inode *inode, void *data) * * These come from src/mds/mdstypes.h in the ceph sources. */ -#define CEPH_MAX_MDS 0x100 -#define CEPH_NUM_STRAY 10 +#define CEPH_MAX_MDS 0x100 +#define CEPH_NUM_STRAY 10 #define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS) +#define CEPH_MDS_INO_LOG_OFFSET (2 * CEPH_MAX_MDS) #define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY)) static inline bool ceph_vino_is_reserved(const struct ceph_vino vino) { - if (vino.ino < CEPH_INO_SYSTEM_BASE && - vino.ino >= CEPH_MDS_INO_MDSDIR_OFFSET) { - WARN_RATELIMIT(1, "Attempt to access reserved inode number 0x%llx", vino.ino); - return true; - } - return false; + if (vino.ino >= CEPH_INO_SYSTEM_BASE || + vino.ino < CEPH_MDS_INO_MDSDIR_OFFSET) + return false; + + /* Don't warn on mdsdirs */ + WARN_RATELIMIT(vino.ino >= CEPH_MDS_INO_LOG_OFFSET, + "Attempt to access reserved inode number 0x%llx", + vino.ino); + return true; } static inline struct inode *ceph_find_inode(struct super_block *sb, From 94cc0877cad0bc6ca84686c4fa874bf530eb8b88 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 30 Nov 2021 14:12:13 -0500 Subject: [PATCH 11/13] ceph: add new "nopagecache" option CephFS is a bit unlike most other filesystems in that it only conditionally does buffered I/O based on the caps that it gets from the MDS. In most cases, unless there is contended access for an inode the MDS does give Fbc caps to the client, so the unbuffered codepaths are only infrequently traveled and are difficult to test. At one time, the "-o sync" mount option would give you this behavior, but that was removed in commit 7ab9b3807097 ("ceph: Don't use ceph-sync-mode for synchronous-fs."). Add a new mount option to tell the client to ignore Fbc caps when doing I/O, and to use the synchronous codepaths exclusively, even on non-O_DIRECT file descriptors. We already have an ioctl that forces this behavior on a per-file basis, so we can just always set the CEPH_F_SYNC flag in the file description on such mounts. Additionally, this patch also changes the client to not request Fbc when doing direct I/O. We aren't using the cache with O_DIRECT so we don't have any need for those caps. Signed-off-by: Jeff Layton Acked-by: Greg Farnum Reviewed-by: Venky Shankar Signed-off-by: Ilya Dryomov --- fs/ceph/file.c | 24 +++++++++++++++--------- fs/ceph/super.c | 11 +++++++++++ fs/ceph/super.h | 1 + 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index c138e8126286..7de5db51c3d0 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -204,6 +204,8 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, int fmode, bool isdir) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mount_options *opt = + ceph_inode_to_client(&ci->vfs_inode)->mount_options; struct ceph_file_info *fi; dout("%s %p %p 0%o (%s)\n", __func__, inode, file, @@ -225,6 +227,9 @@ static int ceph_init_file_info(struct inode *inode, struct file *file, if (!fi) return -ENOMEM; + if (opt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) + fi->flags |= CEPH_F_SYNC; + file->private_data = fi; } @@ -1536,7 +1541,7 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) struct ceph_inode_info *ci = ceph_inode(inode); bool direct_lock = iocb->ki_flags & IOCB_DIRECT; ssize_t ret; - int want, got = 0; + int want = 0, got = 0; int retry_op = 0, read = 0; again: @@ -1551,13 +1556,14 @@ again: else ceph_start_io_read(inode); + if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) + want |= CEPH_CAP_FILE_CACHE; if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_CACHE; + want |= CEPH_CAP_FILE_LAZYIO; + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got); if (ret < 0) { - if (iocb->ki_flags & IOCB_DIRECT) + if (direct_lock) ceph_end_io_direct(inode); else ceph_end_io_read(inode); @@ -1691,7 +1697,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_cap_flush *prealloc_cf; ssize_t count, written = 0; - int err, want, got; + int err, want = 0, got; bool direct_lock = false; u32 map_flags; u64 pool_flags; @@ -1766,10 +1772,10 @@ retry_snap: dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", inode, ceph_vinop(inode), pos, count, i_size_read(inode)); + if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) + want |= CEPH_CAP_FILE_BUFFER; if (fi->fmode & CEPH_FILE_MODE_LAZY) - want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; - else - want = CEPH_CAP_FILE_BUFFER; + want |= CEPH_CAP_FILE_LAZYIO; got = 0; err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got); if (err < 0) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 2166fc2c1625..eb255d24e321 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -160,6 +160,7 @@ enum { Opt_quotadf, Opt_copyfrom, Opt_wsync, + Opt_pagecache, }; enum ceph_recover_session_mode { @@ -201,6 +202,7 @@ static const struct fs_parameter_spec ceph_mount_parameters[] = { fsparam_string ("mon_addr", Opt_mon_addr), fsparam_u32 ("wsize", Opt_wsize), fsparam_flag_no ("wsync", Opt_wsync), + fsparam_flag_no ("pagecache", Opt_pagecache), {} }; @@ -564,6 +566,12 @@ static int ceph_parse_mount_param(struct fs_context *fc, else fsopt->flags |= CEPH_MOUNT_OPT_ASYNC_DIROPS; break; + case Opt_pagecache: + if (result.negated) + fsopt->flags |= CEPH_MOUNT_OPT_NOPAGECACHE; + else + fsopt->flags &= ~CEPH_MOUNT_OPT_NOPAGECACHE; + break; default: BUG(); } @@ -699,6 +707,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)) seq_puts(m, ",wsync"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOPAGECACHE) + seq_puts(m, ",nopagecache"); + if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%u", fsopt->wsize); if (fsopt->rsize != CEPH_MAX_READ_SIZE) diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 5ec465a1350a..0d489a973d5e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -46,6 +46,7 @@ #define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */ #define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */ #define CEPH_MOUNT_OPT_ASYNC_DIROPS (1<<15) /* allow async directory ops */ +#define CEPH_MOUNT_OPT_NOPAGECACHE (1<<16) /* bypass pagecache altogether */ #define CEPH_MOUNT_OPT_DEFAULT \ (CEPH_MOUNT_OPT_DCACHE | \ From 76bdbc7ac777adb6bc316bfe3f57b3de93c50985 Mon Sep 17 00:00:00 2001 From: Xiubo Li Date: Thu, 6 Jan 2022 09:35:52 +0800 Subject: [PATCH 12/13] ceph: remove redundant Lsx caps check The newcaps has already included the Ls, no need to check it again. Signed-off-by: Xiubo Li Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov --- fs/ceph/caps.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index c447fa2e2d1f..62448691608f 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -3375,8 +3375,7 @@ static void handle_cap_grant(struct inode *inode, if ((newcaps & CEPH_CAP_LINK_SHARED) && (extra_info->issued & CEPH_CAP_LINK_EXCL) == 0) { set_nlink(inode, le32_to_cpu(grant->nlink)); - if (inode->i_nlink == 0 && - (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) + if (inode->i_nlink == 0) deleted_inode = true; } From a0b3a15eab6bc2e90008460b646d53e7d9dcdbbb Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 10 Jan 2022 18:28:33 -0500 Subject: [PATCH 13/13] ceph: move CEPH_SUPER_MAGIC definition to magic.h The uapi headers are missing the ceph definition. Move it there so userland apps can ID cephfs. Signed-off-by: Jeff Layton Reviewed-by: Ilya Dryomov Signed-off-by: Ilya Dryomov --- fs/ceph/super.c | 2 ++ fs/ceph/super.h | 3 --- include/uapi/linux/magic.h | 1 + 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index eb255d24e321..dbcf9b743c88 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -27,6 +27,8 @@ #include #include +#include + static DEFINE_SPINLOCK(ceph_fsc_lock); static LIST_HEAD(ceph_fsc_list); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 0d489a973d5e..dc9fe7ed3fa8 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -25,9 +25,6 @@ #include #endif -/* f_type in struct statfs */ -#define CEPH_SUPER_MAGIC 0x00c36400 - /* large granularity for statfs utilization stats to facilitate * large volume sizes on 32-bit machines. */ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */ diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 35687dcb1a42..53a3c20394cf 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -6,6 +6,7 @@ #define AFFS_SUPER_MAGIC 0xadff #define AFS_SUPER_MAGIC 0x5346414F #define AUTOFS_SUPER_MAGIC 0x0187 +#define CEPH_SUPER_MAGIC 0x00c36400 #define CODA_SUPER_MAGIC 0x73757245 #define CRAMFS_MAGIC 0x28cd3d45 /* some random number */ #define CRAMFS_MAGIC_WEND 0x453dcd28 /* magic number with the wrong endianess */